Open-Meteo maintains an API for historical weather that allows for non-commercial usage of historical weather data maintained by the website.
This file runs exploratory analysis on some of the historical weather data.
The exploration process uses tidyverse and several generic custom functions:
library(tidyverse) # tidyverse functionality is included throughout
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.0 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.1 ✔ tibble 3.1.8
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
source("./Generic_Added_Utility_Functions_202105_v001.R") # Basic functions
A sample of data for 365 days has been downloaded as a CSV. The downloaded data has three separate files included in a single tab, separated by a blank row. The first file is location data, the second file is hourly data, and the third file is daily data. For initial exploration, parameters specific to this file are used:
omFileLoc <- "./RInputFiles/openmeteo_20230612_example.csv"
# Location data
omLocation <- readr::read_csv(omFileLoc, n_max=1, skip=0)
## Rows: 1 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): timezone, timezone_abbreviation
## dbl (4): latitude, longitude, elevation, utc_offset_seconds
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
omLocation
## # A tibble: 1 × 6
## latitude longitude elevation utc_offset_seconds timezone timezone_abb…¹
## <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 41.8 -87.6 179 -18000 America/Chicago CDT
## # … with abbreviated variable name ¹timezone_abbreviation
# Hourly data
# Elements: time, 2m temp (C), 2m dew point (C), 2m relative humidity (%), precip (mm), rain (mm), and snow (cm)
omHourlyRaw <- readr::read_csv(omFileLoc, n_max=8760, skip=3)
## Rows: 8760 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (6): temperature_2m (°C), relativehumidity_2m (%), dewpoint_2m (°C), pr...
## dttm (1): time
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
omHourlyProcess <- omHourlyRaw %>%
purrr::set_names(c("time", "temp2m_C", "relH2m", "dew2m_C", "precip_mm", "rain_mm", "snow_cm")) %>%
mutate(date=date(time))
omHourlyProcess
## # A tibble: 8,760 × 8
## time temp2…¹ relH2m dew2m_C preci…² rain_mm snow_cm date
## <dttm> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <date>
## 1 2022-06-08 00:00:00 13.4 91 11.9 0 0 0 2022-06-08
## 2 2022-06-08 01:00:00 13.4 91 12 0 0 0 2022-06-08
## 3 2022-06-08 02:00:00 13.8 87 11.8 0 0 0 2022-06-08
## 4 2022-06-08 03:00:00 13.8 87 11.7 0 0 0 2022-06-08
## 5 2022-06-08 04:00:00 14 85 11.6 0 0 0 2022-06-08
## 6 2022-06-08 05:00:00 14.4 82 11.3 0 0 0 2022-06-08
## 7 2022-06-08 06:00:00 14.8 79 11.1 0 0 0 2022-06-08
## 8 2022-06-08 07:00:00 15.1 77 11.1 0.1 0.1 0 2022-06-08
## 9 2022-06-08 08:00:00 15.7 75 11.2 0 0 0 2022-06-08
## 10 2022-06-08 09:00:00 16.4 72 11.3 0 0 0 2022-06-08
## # … with 8,750 more rows, and abbreviated variable names ¹temp2m_C, ²precip_mm
summary(omHourlyProcess)
## time temp2m_C relH2m
## Min. :2022-06-08 00:00:00 Min. :-20.60 Min. : 32.00
## 1st Qu.:2022-09-07 05:45:00 1st Qu.: 2.80 1st Qu.: 62.00
## Median :2022-12-07 11:30:00 Median : 10.30 Median : 72.00
## Mean :2022-12-07 11:30:00 Mean : 10.81 Mean : 72.38
## 3rd Qu.:2023-03-08 17:15:00 3rd Qu.: 19.80 3rd Qu.: 83.00
## Max. :2023-06-07 23:00:00 Max. : 31.50 Max. :100.00
## NA's :53 NA's :53
## dew2m_C precip_mm rain_mm snow_cm
## Min. :-24.300 Min. : 0.00000 Min. : 0.00000 Min. :0.00000
## 1st Qu.: -1.400 1st Qu.: 0.00000 1st Qu.: 0.00000 1st Qu.:0.00000
## Median : 5.500 Median : 0.00000 Median : 0.00000 Median :0.00000
## Mean : 5.792 Mean : 0.09986 Mean : 0.09167 Mean :0.00573
## 3rd Qu.: 14.700 3rd Qu.: 0.00000 3rd Qu.: 0.00000 3rd Qu.:0.00000
## Max. : 24.200 Max. :11.10000 Max. :11.10000 Max. :1.26000
## NA's :53 NA's :53 NA's :53 NA's :53
## date
## Min. :2022-06-08
## 1st Qu.:2022-09-07
## Median :2022-12-07
## Mean :2022-12-07
## 3rd Qu.:2023-03-08
## Max. :2023-06-07
##
# Daily data
# Elements: date, sum of precip (mm), sum of rain (mm), and sum of snow (cm)
omDailyRaw <- readr::read_csv(omFileLoc, n_max=365, skip=8765)
## Rows: 365 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (3): precipitation_sum (mm), rain_sum (mm), snowfall_sum (cm)
## date (1): time
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
omDailyProcess <- omDailyRaw %>%
purrr::set_names(c("date", "precip_mm", "rain_mm", "snow_cm"))
omDailyProcess
## # A tibble: 365 × 4
## date precip_mm rain_mm snow_cm
## <date> <dbl> <dbl> <dbl>
## 1 2022-06-08 16 16 0
## 2 2022-06-09 0 0 0
## 3 2022-06-10 0.6 0.6 0
## 4 2022-06-11 0 0 0
## 5 2022-06-12 1.3 1.3 0
## 6 2022-06-13 2.6 2.6 0
## 7 2022-06-14 0 0 0
## 8 2022-06-15 0 0 0
## 9 2022-06-16 9.5 9.5 0
## 10 2022-06-17 0 0 0
## # … with 355 more rows
summary(omDailyProcess)
## date precip_mm rain_mm snow_cm
## Min. :2022-06-08 Min. : 0.000 Min. : 0.000 Min. :0.0000
## 1st Qu.:2022-09-07 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.:0.0000
## Median :2022-12-07 Median : 0.000 Median : 0.000 Median :0.0000
## Mean :2022-12-07 Mean : 2.402 Mean : 2.205 Mean :0.1379
## 3rd Qu.:2023-03-08 3rd Qu.: 1.875 3rd Qu.: 1.300 3rd Qu.:0.0000
## Max. :2023-06-07 Max. :40.000 Max. :40.000 Max. :6.6500
## NA's :3 NA's :3 NA's :3
A function is written to read a portion of a CSV file:
partialCSVRead <- function(loc, firstRow=1L, lastRow=+Inf, col_names=TRUE, ...) {
# FUNCTION arguments
# loc: file location
# firstRow: first row that is relevant to the partial file read (whether header line or data line)
# last Row: last row that is relevant to the partial file read (+Inf means read until last line of file)
# col_names: the col_names parameter passed to readr::read_csv
# TRUE means header=TRUE (get column names from file, read data starting on next line)
# FALSE means header=FALSE (auto-generate column names, read data starting on first line)
# character vector means use these as column names (read data starting on first line)
# ...: additional arguments passed to read_csv
# Read the file and return
# skip: rows to be skipped are all those prior to firstRow
# n_max: maximum rows read are lastRow-firstRow, with an additional data row when col_names is not TRUE
readr::read_csv(loc,
skip=firstRow-1,
n_max=lastRow-firstRow+ifelse(isTRUE(col_names), 0, 1),
...
)
}
# Double check that data are the same
partialCSVRead(omFileLoc, firstRow=1L, lastRow=2L) %>% all.equal(omLocation)
## Rows: 1 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): timezone, timezone_abbreviation
## dbl (4): latitude, longitude, elevation, utc_offset_seconds
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## [1] TRUE
partialCSVRead(omFileLoc, firstRow=4L, lastRow=8764L) %>% all.equal(omHourlyRaw)
## Rows: 8760 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (6): temperature_2m (°C), relativehumidity_2m (%), dewpoint_2m (°C), pr...
## dttm (1): time
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## [1] TRUE
partialCSVRead(omFileLoc, firstRow=8766L, lastRow=+Inf) %>% all.equal(omDailyRaw)
## Rows: 365 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (3): precipitation_sum (mm), rain_sum (mm), snowfall_sum (cm)
## date (1): time
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## [1] TRUE
The blank lines are assessed, allowing for all tables to be read at the same time:
# Get the break points for gaps in a vector (e.g., 0, 3, 5:8, 20 has break points 0, 3, 5, 20 and 0, 3, 8, 30)
vecGaps <- function(x, addElements=c(), sortUnique=TRUE) {
if(length(addElements)>0) x <- c(addElements, x)
if(isTRUE(sortUnique)) x <- unique(sort(x))
list("starts"=c(x[is.na(lag(x)) | x-lag(x)>1], +Inf),
"ends"=x[is.na(lead(x)) | lead(x)-x>1]
)
}
vecGaps(c(3, 5:8, 20), addElements=0)
## $starts
## [1] 0 3 5 20 Inf
##
## $ends
## [1] 0 3 8 20
# Find the break points in a single file
flatFileGaps <- function(loc) {
which(stringr::str_length(readLines(loc))==0) %>% vecGaps(addElements=0)
}
flatFileGaps(omFileLoc)
## $starts
## [1] 0 3 8765 Inf
##
## $ends
## [1] 0 3 8765
# Read all relevant data as CSV with header
readMultiCSV <- function(loc, col_names=TRUE, ...) {
gaps <- flatFileGaps(loc)
lapply(seq_along(gaps$ends),
FUN=function(x) partialCSVRead(loc,
firstRow=gaps$ends[x]+1,
lastRow=gaps$starts[x+1]-1,
col_names=col_names,
...
)
)
}
tstMultiCSV <- readMultiCSV(omFileLoc)
## Rows: 1 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): timezone, timezone_abbreviation
## dbl (4): latitude, longitude, elevation, utc_offset_seconds
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 8760 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (6): temperature_2m (°C), relativehumidity_2m (%), dewpoint_2m (°C), pr...
## dttm (1): time
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 365 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (3): precipitation_sum (mm), rain_sum (mm), snowfall_sum (cm)
## date (1): time
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
all.equal(tstMultiCSV[[1]], omLocation)
## [1] TRUE
all.equal(tstMultiCSV[[2]], omHourlyRaw)
## [1] TRUE
all.equal(tstMultiCSV[[3]], omDailyRaw)
## [1] TRUE
Data can also be downloaded through the Open-Meteo API, returning a JSON file. The data download has been completed off-line to minimize repeated hits against the server. The JSON file can then be read:
# Example download sequence
# download.file("https://archive-api.open-meteo.com/v1/archive?latitude=41.85&longitude=-87.65&start_date=2022-06-01&end_date=2023-06-08&hourly=temperature_2m,relativehumidity_2m,dewpoint_2m,precipitation,rain,snowfall&daily=precipitation_sum,rain_sum,snowfall_sum&timezone=America%2FChicago", "tempOM")
# Create hourly data tibble
jsonHourly <- jsonlite::read_json("tempOM", simplifyVector = TRUE)[["hourly"]] %>%
tibble::as_tibble() %>%
mutate(tm=lubridate::ymd_hm(time), date=date(tm))
jsonHourly
## # A tibble: 8,952 × 9
## time tempe…¹ relat…² dewpo…³ preci…⁴ rain snowf…⁵ tm
## <chr> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <dttm>
## 1 2022-06-01… 21 92 19.6 0.1 0.1 0 2022-06-01 00:00:00
## 2 2022-06-01… 20.6 93 19.5 0.3 0.3 0 2022-06-01 01:00:00
## 3 2022-06-01… 21 93 19.8 0 0 0 2022-06-01 02:00:00
## 4 2022-06-01… 20.8 93 19.7 0 0 0 2022-06-01 03:00:00
## 5 2022-06-01… 20.5 93 19.4 0 0 0 2022-06-01 04:00:00
## 6 2022-06-01… 19.8 95 19 0.7 0.7 0 2022-06-01 05:00:00
## 7 2022-06-01… 19.3 97 18.8 1.6 1.6 0 2022-06-01 06:00:00
## 8 2022-06-01… 19 97 18.4 1 1 0 2022-06-01 07:00:00
## 9 2022-06-01… 18.1 92 16.9 0.1 0.1 0 2022-06-01 08:00:00
## 10 2022-06-01… 16.8 87 14.6 0 0 0 2022-06-01 09:00:00
## # … with 8,942 more rows, 1 more variable: date <date>, and abbreviated
## # variable names ¹temperature_2m, ²relativehumidity_2m, ³dewpoint_2m,
## # ⁴precipitation, ⁵snowfall
# Create daily data tibble
jsonDaily <- jsonlite::read_json("tempOM", simplifyVector = TRUE)[["daily"]] %>%
tibble::as_tibble()
jsonDaily
## # A tibble: 373 × 4
## time precipitation_sum rain_sum snowfall_sum
## <chr> <dbl> <dbl> <dbl>
## 1 2022-06-01 3.8 3.8 0
## 2 2022-06-02 0 0 0
## 3 2022-06-03 0 0 0
## 4 2022-06-04 1.3 1.3 0
## 5 2022-06-05 0.3 0.3 0
## 6 2022-06-06 12.5 12.5 0
## 7 2022-06-07 2 2 0
## 8 2022-06-08 16 16 0
## 9 2022-06-09 0 0 0
## 10 2022-06-10 0.6 0.6 0
## # … with 363 more rows
# Extract other elements
jsonNames <- jsonlite::read_json("tempOM", simplifyVector = TRUE) %>% names
for (jsonName in jsonNames[!(jsonNames %in% c("daily", "hourly", "daily_units", "hourly_units"))]) {
cat("\n", jsonName, ":", jsonlite::read_json("tempOM", simplifyVector = TRUE)[[jsonName]])
}
##
## latitude : 41.8
## longitude : -87.6
## generationtime_ms : 2.892971
## utc_offset_seconds : -18000
## timezone : America/Chicago
## timezone_abbreviation : CDT
## elevation : 179
for (jsonName in jsonNames[jsonNames %in% c("daily_units", "hourly_units")]) {
cat("\n", jsonName, ":\n")
print(jsonlite::read_json("tempOM", simplifyVector = TRUE)[[jsonName]] %>% tibble::as_tibble() %>% t())
}
##
## hourly_units :
## [,1]
## time "iso8601"
## temperature_2m "°C"
## relativehumidity_2m "%"
## dewpoint_2m "°C"
## precipitation "mm"
## rain "mm"
## snowfall "cm"
##
## daily_units :
## [,1]
## time "iso8601"
## precipitation_sum "mm"
## rain_sum "mm"
## snowfall_sum "cm"
Daily data read from JSON and CSV are compared:
# Convert variable names in JSON daily data
jsonDailyProcess <- jsonDaily %>%
colRenamer(vecRename=c("precipitation_sum"="precip_mm",
"rain_sum"="rain_mm",
"snowfall_sum"="snow_cm",
"time"="date"
)
) %>%
mutate(date=as.Date(date))
jsonDailyProcess
## # A tibble: 373 × 4
## date precip_mm rain_mm snow_cm
## <date> <dbl> <dbl> <dbl>
## 1 2022-06-01 3.8 3.8 0
## 2 2022-06-02 0 0 0
## 3 2022-06-03 0 0 0
## 4 2022-06-04 1.3 1.3 0
## 5 2022-06-05 0.3 0.3 0
## 6 2022-06-06 12.5 12.5 0
## 7 2022-06-07 2 2 0
## 8 2022-06-08 16 16 0
## 9 2022-06-09 0 0 0
## 10 2022-06-10 0.6 0.6 0
## # … with 363 more rows
# Check dates included
omDailyProcess %>%
select(date) %>%
mutate(inCSV=1) %>%
full_join(mutate(select(jsonDailyProcess, "date"), inJSON=1), by="date") %>%
filter(!complete.cases(.))
## # A tibble: 8 × 3
## date inCSV inJSON
## <date> <dbl> <dbl>
## 1 2022-06-01 NA 1
## 2 2022-06-02 NA 1
## 3 2022-06-03 NA 1
## 4 2022-06-04 NA 1
## 5 2022-06-05 NA 1
## 6 2022-06-06 NA 1
## 7 2022-06-07 NA 1
## 8 2023-06-08 NA 1
# Check column names
all.equal(names(omDailyProcess), names(jsonDailyProcess))
## [1] TRUE
# Check data elements from 2022-06-08 through 2023-06-04 (last full day of data)
all.equal(omDailyProcess %>% tibble::as_tibble() %>% filter(date>="2022-06-08", date<="2023-06-04"),
jsonDailyProcess %>% filter(date>="2022-06-08", date<="2023-06-04")
)
## [1] TRUE
Hourly data read from JSON and CSV are compared:
# Convert variable names in JSON hourly data
jsonHourlyProcess <- jsonHourly %>%
select(-time) %>%
colRenamer(vecRename=c("temperature_2m"="temp2m_C",
"relativehumidity_2m"="relH2m",
"dewpoint_2m"="dew2m_C",
"precipitation"="precip_mm",
"rain"="rain_mm",
"snowfall"="snow_cm",
"tm"="time"
)
) %>%
select(time, everything())
jsonHourlyProcess
## # A tibble: 8,952 × 8
## time temp2…¹ relH2m dew2m_C preci…² rain_mm snow_cm date
## <dttm> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <date>
## 1 2022-06-01 00:00:00 21 92 19.6 0.1 0.1 0 2022-06-01
## 2 2022-06-01 01:00:00 20.6 93 19.5 0.3 0.3 0 2022-06-01
## 3 2022-06-01 02:00:00 21 93 19.8 0 0 0 2022-06-01
## 4 2022-06-01 03:00:00 20.8 93 19.7 0 0 0 2022-06-01
## 5 2022-06-01 04:00:00 20.5 93 19.4 0 0 0 2022-06-01
## 6 2022-06-01 05:00:00 19.8 95 19 0.7 0.7 0 2022-06-01
## 7 2022-06-01 06:00:00 19.3 97 18.8 1.6 1.6 0 2022-06-01
## 8 2022-06-01 07:00:00 19 97 18.4 1 1 0 2022-06-01
## 9 2022-06-01 08:00:00 18.1 92 16.9 0.1 0.1 0 2022-06-01
## 10 2022-06-01 09:00:00 16.8 87 14.6 0 0 0 2022-06-01
## # … with 8,942 more rows, and abbreviated variable names ¹temp2m_C, ²precip_mm
# Check dates included
omHourlyProcess %>%
count(date, name="nCSV") %>%
full_join(count(jsonHourlyProcess, date, name="nJSON"), by="date") %>%
filter(!complete.cases(.))
## # A tibble: 8 × 3
## date nCSV nJSON
## <date> <int> <int>
## 1 2022-06-01 NA 24
## 2 2022-06-02 NA 24
## 3 2022-06-03 NA 24
## 4 2022-06-04 NA 24
## 5 2022-06-05 NA 24
## 6 2022-06-06 NA 24
## 7 2022-06-07 NA 24
## 8 2023-06-08 NA 24
# Check column names
all.equal(names(omHourlyProcess), names(jsonHourlyProcess))
## [1] TRUE
# Check data elements from 2022-06-08 through 2023-06-04 (last full day of data)
all.equal(omHourlyProcess %>% tibble::as_tibble() %>% filter(date>="2022-06-08", date<="2023-06-04"),
jsonHourlyProcess %>% filter(date>="2022-06-08", date<="2023-06-04")
)
## [1] TRUE
Metrics that can be reuested for hourly and daily data include:
hourlyMetrics <- "temperature_2m,relativehumidity_2m,dewpoint_2m,apparent_temperature,pressure_msl,surface_pressure,precipitation,rain,snowfall,cloudcover,cloudcover_low,cloudcover_mid,cloudcover_high,shortwave_radiation,direct_radiation,direct_normal_irradiance,diffuse_radiation,windspeed_10m,windspeed_100m,winddirection_10m,winddirection_100m,windgusts_10m,et0_fao_evapotranspiration,weathercode,vapor_pressure_deficit,soil_temperature_0_to_7cm,soil_temperature_7_to_28cm,soil_temperature_28_to_100cm,soil_temperature_100_to_255cm,soil_moisture_0_to_7cm,soil_moisture_7_to_28cm,soil_moisture_28_to_100cm,soil_moisture_100_to_255cm"
dailyMetrics <- "weathercode,temperature_2m_max,temperature_2m_min,apparent_temperature_max,apparent_temperature_min,precipitation_sum,rain_sum,snowfall_sum,precipitation_hours,sunrise,sunset,windspeed_10m_max,windgusts_10m_max,winddirection_10m_dominant,shortwave_radiation_sum,et0_fao_evapotranspiration"
hourlyDescription <- "Air temperature at 2 meters above ground\nRelative humidity at 2 meters above ground\nDew point temperature at 2 meters above ground\nApparent temperature is the perceived feels-like temperature combining wind chill factor, relative humidity and solar radiation\nAtmospheric air pressure reduced to mean sea level (msl) or pressure at surface. Typically pressure on mean sea level is used in meteorology. Surface pressure gets lower with increasing elevation.\nAtmospheric air pressure reduced to mean sea level (msl) or pressure at surface. Typically pressure on mean sea level is used in meteorology. Surface pressure gets lower with increasing elevation.\nTotal precipitation (rain, showers, snow) sum of the preceding hour. Data is stored with a 0.1 mm precision. If precipitation data is summed up to monthly sums, there might be small inconsistencies with the total precipitation amount.\nOnly liquid precipitation of the preceding hour including local showers and rain from large scale systems.\nSnowfall amount of the preceding hour in centimeters. For the water equivalent in millimeter, divide by 7. E.g. 7 cm snow = 10 mm precipitation water equivalent\nTotal cloud cover as an area fraction\nLow level clouds and fog up to 2 km altitude\nMid level clouds from 2 to 6 km altitude\nHigh level clouds from 6 km altitude\nShortwave solar radiation as average of the preceding hour. This is equal to the total global horizontal irradiation\nDirect solar radiation as average of the preceding hour on the horizontal plane and the normal plane (perpendicular to the sun)\nDirect solar radiation as average of the preceding hour on the horizontal plane and the normal plane (perpendicular to the sun)\nDiffuse solar radiation as average of the preceding hour\nWind speed at 10 or 100 meters above ground. Wind speed on 10 meters is the standard level.\nWind speed at 10 or 100 meters above ground. Wind speed on 10 meters is the standard level.\nWind direction at 10 or 100 meters above ground\nWind direction at 10 or 100 meters above ground\nGusts at 10 meters above ground of the indicated hour. Wind gusts in CERRA are defined as the maximum wind gusts of the preceding hour. Please consult the ECMWF IFS documentation for more information on how wind gusts are parameterized in weather models.\nET0 Reference Evapotranspiration of a well watered grass field. Based on FAO-56 Penman-Monteith equations ET0 is calculated from temperature, wind speed, humidity and solar radiation. Unlimited soil water is assumed. ET0 is commonly used to estimate the required irrigation for plants.\nWeather condition as a numeric code. Follow WMO weather interpretation codes. See table below for details. Weather code is calculated from cloud cover analysis, precipitation and snowfall. As barely no information about atmospheric stability is available, estimation about thunderstorms is not possible.\nVapor Pressure Deificit (VPD) in kilopascal (kPa). For high VPD (>1.6), water transpiration of plants increases. For low VPD (<0.4), transpiration decreases\nAverage temperature of different soil levels below ground.\nAverage temperature of different soil levels below ground.\nAverage temperature of different soil levels below ground.\nAverage temperature of different soil levels below ground.\nAverage soil water content as volumetric mixing ratio at 0-7, 7-28, 28-100 and 100-255 cm depths.\nAverage soil water content as volumetric mixing ratio at 0-7, 7-28, 28-100 and 100-255 cm depths.\nAverage soil water content as volumetric mixing ratio at 0-7, 7-28, 28-100 and 100-255 cm depths.\nAverage soil water content as volumetric mixing ratio at 0-7, 7-28, 28-100 and 100-255 cm depths."
dailyDescription <- "The most severe weather condition on a given day\nMaximum and minimum daily air temperature at 2 meters above ground\nMaximum and minimum daily air temperature at 2 meters above ground\nMaximum and minimum daily apparent temperature\nMaximum and minimum daily apparent temperature\nSum of daily precipitation (including rain, showers and snowfall)\nSum of daily rain\nSum of daily snowfall\nThe number of hours with rain\nSun rise and set times\nSun rise and set times\nMaximum wind speed and gusts on a day\nMaximum wind speed and gusts on a day\nDominant wind direction\nThe sum of solar radiaion on a given day in Megajoules\nDaily sum of ET0 Reference Evapotranspiration of a well watered grass field"
# Create tibble for hourly metrics
tblMetricsHourly <- tibble::tibble(metric=hourlyMetrics %>% str_split_1(","),
description=hourlyDescription %>% str_split_1("\n")
)
tblMetricsHourly %>%
print(n=50)
## # A tibble: 33 × 2
## metric description
## <chr> <chr>
## 1 temperature_2m Air temperature at 2 meters above ground
## 2 relativehumidity_2m Relative humidity at 2 meters above ground
## 3 dewpoint_2m Dew point temperature at 2 meters above ground
## 4 apparent_temperature Apparent temperature is the perceived feels-li…
## 5 pressure_msl Atmospheric air pressure reduced to mean sea l…
## 6 surface_pressure Atmospheric air pressure reduced to mean sea l…
## 7 precipitation Total precipitation (rain, showers, snow) sum …
## 8 rain Only liquid precipitation of the preceding hou…
## 9 snowfall Snowfall amount of the preceding hour in centi…
## 10 cloudcover Total cloud cover as an area fraction
## 11 cloudcover_low Low level clouds and fog up to 2 km altitude
## 12 cloudcover_mid Mid level clouds from 2 to 6 km altitude
## 13 cloudcover_high High level clouds from 6 km altitude
## 14 shortwave_radiation Shortwave solar radiation as average of the pr…
## 15 direct_radiation Direct solar radiation as average of the prece…
## 16 direct_normal_irradiance Direct solar radiation as average of the prece…
## 17 diffuse_radiation Diffuse solar radiation as average of the prec…
## 18 windspeed_10m Wind speed at 10 or 100 meters above ground. W…
## 19 windspeed_100m Wind speed at 10 or 100 meters above ground. W…
## 20 winddirection_10m Wind direction at 10 or 100 meters above ground
## 21 winddirection_100m Wind direction at 10 or 100 meters above ground
## 22 windgusts_10m Gusts at 10 meters above ground of the indicat…
## 23 et0_fao_evapotranspiration ET0 Reference Evapotranspiration of a well wat…
## 24 weathercode Weather condition as a numeric code. Follow WM…
## 25 vapor_pressure_deficit Vapor Pressure Deificit (VPD) in kilopascal (k…
## 26 soil_temperature_0_to_7cm Average temperature of different soil levels b…
## 27 soil_temperature_7_to_28cm Average temperature of different soil levels b…
## 28 soil_temperature_28_to_100cm Average temperature of different soil levels b…
## 29 soil_temperature_100_to_255cm Average temperature of different soil levels b…
## 30 soil_moisture_0_to_7cm Average soil water content as volumetric mixin…
## 31 soil_moisture_7_to_28cm Average soil water content as volumetric mixin…
## 32 soil_moisture_28_to_100cm Average soil water content as volumetric mixin…
## 33 soil_moisture_100_to_255cm Average soil water content as volumetric mixin…
# Create tibble for daily metrics
tblMetricsDaily <- tibble::tibble(metric=dailyMetrics %>% str_split_1(","),
description=dailyDescription %>% str_split_1("\n")
)
tblMetricsDaily
## # A tibble: 16 × 2
## metric description
## <chr> <chr>
## 1 weathercode The most severe weather condition on a given day
## 2 temperature_2m_max Maximum and minimum daily air temperature at 2 me…
## 3 temperature_2m_min Maximum and minimum daily air temperature at 2 me…
## 4 apparent_temperature_max Maximum and minimum daily apparent temperature
## 5 apparent_temperature_min Maximum and minimum daily apparent temperature
## 6 precipitation_sum Sum of daily precipitation (including rain, showe…
## 7 rain_sum Sum of daily rain
## 8 snowfall_sum Sum of daily snowfall
## 9 precipitation_hours The number of hours with rain
## 10 sunrise Sun rise and set times
## 11 sunset Sun rise and set times
## 12 windspeed_10m_max Maximum wind speed and gusts on a day
## 13 windgusts_10m_max Maximum wind speed and gusts on a day
## 14 winddirection_10m_dominant Dominant wind direction
## 15 shortwave_radiation_sum The sum of solar radiaion on a given day in Megaj…
## 16 et0_fao_evapotranspiration Daily sum of ET0 Reference Evapotranspiration of …
Data can then be assembled into a string that is compatible with the Open-Meteo API format:
openMeteoURLCreate <- function(mainURL="https://archive-api.open-meteo.com/v1/archive",
lat=45,
lon=-90,
startDate=paste(year(Sys.Date())-1, "01", "01", sep="-"),
endDate=paste(year(Sys.Date())-1, "12", "31", sep="-"),
hourlyMetrics=NULL,
dailyMetrics=NULL,
tz="GMT",
...
) {
# Create formatted string
fString <- paste0(mainURL,
"?latitude=",
lat,
"&longitude=",
lon,
"&start_date=",
startDate,
"&end_date=",
endDate
)
if(!is.null(hourlyMetrics)) fString <- paste0(fString, "&hourly=", hourlyMetrics)
if(!is.null(dailyMetrics)) fString <- paste0(fString, "&daily=", dailyMetrics)
# Return the formatted string
paste0(fString, "&timezone=", stringr::str_replace(tz, "/", "%2F"), ...)
}
# Blank example
openMeteoURLCreate()
## [1] "https://archive-api.open-meteo.com/v1/archive?latitude=45&longitude=-90&start_date=2022-01-01&end_date=2022-12-31&timezone=GMT"
# Matching previous CSV data pull
openMeteoURLCreate(lat=41.85,
lon=-87.65,
startDate="2022-06-01",
endDate="2023-06-08",
hourlyMetrics="temperature_2m,relativehumidity_2m,dewpoint_2m,precipitation,rain,snowfall",
dailyMetrics="precipitation_sum,rain_sum,snowfall_sum",
tz="America/Chicago"
)
## [1] "https://archive-api.open-meteo.com/v1/archive?latitude=41.85&longitude=-87.65&start_date=2022-06-01&end_date=2023-06-08&hourly=temperature_2m,relativehumidity_2m,dewpoint_2m,precipitation,rain,snowfall&daily=precipitation_sum,rain_sum,snowfall_sum&timezone=America%2FChicago"
A helper function is created to convert cities to lat/lon and to allow for selection of hourly and daily metrics by index number:
helperOpenMeteoURL <- function(cityName=NULL,
lat=NULL,
lon=NULL,
hourlyMetrics=NULL,
hourlyIndices=NULL,
hourlyDesc=tblMetricsHourly,
dailyMetrics=NULL,
dailyIndices=NULL,
dailyDesc=tblMetricsDaily,
startDate=NULL,
endDate=NULL,
tz=NULL,
...
) {
# Convert city to lat/lon if lat/lon are NULL
if(is.null(lat) | is.null(lon)) {
if(is.null(cityName)) stop("\nMust provide lat/lon or city name available in maps::us.cities\n")
cityData <- maps::us.cities %>% tibble::as_tibble() %>% filter(name==cityName)
if(nrow(cityData)!=1) stop("\nMust provide city name that maps uniquely to maps::us.cities$name\n")
lat <- cityData$lat[1]
lon <- cityData$long[1]
}
# Get hourly metrics by index if relevant
if(is.null(hourlyMetrics) & !is.null(hourlyIndices)) {
hourlyMetrics <- hourlyDesc %>% slice(hourlyIndices) %>% pull(metric)
hourlyMetrics <- paste0(hourlyMetrics, collapse=",")
cat("\nHourly metrics created from indices:", hourlyMetrics, "\n\n")
}
# Get daily metrics by index if relevant
if(is.null(dailyMetrics) & !is.null(dailyIndices)) {
dailyMetrics <- dailyDesc %>% slice(dailyIndices) %>% pull(metric)
dailyMetrics <- paste0(dailyMetrics, collapse=",")
cat("\nDaily metrics created from indices:", dailyMetrics, "\n\n")
}
# Use default values from OpenMeteoURLCreate() for startDate, endDate, and tz if passed as NULL
if(is.null(startDate)) startDate <- eval(formals(openMeteoURLCreate)$startDate)
if(is.null(endDate)) endDate <- eval(formals(openMeteoURLCreate)$endDate)
if(is.null(tz)) tz <- eval(formals(openMeteoURLCreate)$tz)
# Create and return URL
openMeteoURLCreate(lat=lat,
lon=lon,
startDate=startDate,
endDate=endDate,
hourlyMetrics=hourlyMetrics,
dailyMetrics=dailyMetrics,
tz=tz,
...
)
}
The URL is tested for file download, cached to avoid multiple hits to the server:
testURL <- helperOpenMeteoURL(cityName="Chicago IL",
hourlyIndices=c(1:3, 7:9),
dailyIndices=6:8,
startDate="2022-06-01",
endDate="2023-06-08",
tz="America/Chicago"
)
##
## Hourly metrics created from indices: temperature_2m,relativehumidity_2m,dewpoint_2m,precipitation,rain,snowfall
##
##
## Daily metrics created from indices: precipitation_sum,rain_sum,snowfall_sum
testURL
## [1] "https://archive-api.open-meteo.com/v1/archive?latitude=41.84&longitude=-87.68&start_date=2022-06-01&end_date=2023-06-08&hourly=temperature_2m,relativehumidity_2m,dewpoint_2m,precipitation,rain,snowfall&daily=precipitation_sum,rain_sum,snowfall_sum&timezone=America%2FChicago"
# Download file
if(!file.exists("notuse_testOM.json")) {
fileDownload(fileName="notuse_testOM.json", url=testURL)
} else {
cat("\nFile notuse_testOM.json already exists, skipping download\n")
}
## size isdir mode mtime ctime
## notuse_testOM.json 426872 FALSE 666 2023-06-19 08:56:49 2023-06-19 08:56:45
## atime exe
## notuse_testOM.json 2023-06-19 08:56:49 no
Code is created to read the JSON return object:
readOpenMeteoJSON <- function(js) {
# FUNCTION arguments:
# js: JSON list returned by download from Open-Meteo
# Get the object and names
jsObj <- jsonlite::read_json(js, simplifyVector = TRUE)
nms <- jsObj %>% names()
cat("\nObjects in JSON include:", paste(nms, collapse=", "), "\n\n")
# Set default objects as NULL
tblDaily <- NULL
tblHourly <- NULL
tblUnitsDaily <- NULL
tblUnitsHourly <- NULL
# Get daily and hourly as tibble if relevant
if("daily" %in% nms) tblDaily <- jsObj$daily %>% tibble::as_tibble()
if("hourly" %in% nms) tblHourly <- jsObj$hourly %>% tibble::as_tibble()
# Helper function for unit conversions
helperMetricUnit <- function(x, mapper, desc) {
x %>%
tibble::as_tibble() %>%
pivot_longer(cols=everything()) %>%
left_join(mapper, by=c("name"="metric")) %>%
mutate(value=stringr::str_replace(value, "\u00b0", "deg ")) %>%
mutate(metricType=desc) %>%
select(metricType, everything())
}
# Get the unit descriptions
if("daily_units" %in% nms)
tblUnitsDaily <- helperMetricUnit(jsObj$daily_units, tblMetricsDaily, desc="daily_units")
if("hourly_units" %in% nms)
tblUnitsHourly <- helperMetricUnit(jsObj$hourly_units, tblMetricsHourly, desc="hourly_units")
if(is.null(tblUnitsDaily) & !is.null(tblUnitsHourly)) tblUnits <- tblUnitsHourly
else if(!is.null(tblUnitsDaily) & is.null(tblUnitsHourly)) tblUnits <- tblUnitsDaily
else if(!is.null(tblUnitsDaily) & !is.null(tblUnitsHourly))
tblUnits <- bind_rows(tblUnitsHourly, tblUnitsDaily)
else tblUnits <- NULL
# Put everything else together
tblDescription <- jsObj[setdiff(nms, c("hourly", "hourly_units", "daily", "daily_units"))] %>%
tibble::as_tibble()
# Return the list objects
list(tblDaily=tblDaily, tblHourly=tblHourly, tblUnits=tblUnits, tblDescription=tblDescription)
}
prettyOpenMeteoMeta <- function(df, extr="tblDescription") {
if("list" %in% class(df)) df <- df[[extr]]
for(name in names(df)) {
cat("\n", name, ": ", df %>% pull(name), sep="")
}
cat("\n\n")
}
tmpOM <- readOpenMeteoJSON("notuse_testOM.json")
##
## Objects in JSON include: latitude, longitude, generationtime_ms, utc_offset_seconds, timezone, timezone_abbreviation, elevation, hourly_units, hourly, daily_units, daily
tmpOM
## $tblDaily
## # A tibble: 373 × 4
## time precipitation_sum rain_sum snowfall_sum
## <chr> <dbl> <dbl> <dbl>
## 1 2022-06-01 3.8 3.8 0
## 2 2022-06-02 0 0 0
## 3 2022-06-03 0 0 0
## 4 2022-06-04 1.3 1.3 0
## 5 2022-06-05 0.3 0.3 0
## 6 2022-06-06 12.5 12.5 0
## 7 2022-06-07 2 2 0
## 8 2022-06-08 16 16 0
## 9 2022-06-09 0 0 0
## 10 2022-06-10 0.6 0.6 0
## # … with 363 more rows
##
## $tblHourly
## # A tibble: 8,952 × 7
## time temperature_2m relativehumid…¹ dewpo…² preci…³ rain snowf…⁴
## <chr> <dbl> <int> <dbl> <dbl> <dbl> <dbl>
## 1 2022-06-01T00:00 21 92 19.7 0.1 0.1 0
## 2 2022-06-01T01:00 20.6 94 19.6 0.3 0.3 0
## 3 2022-06-01T02:00 21.1 93 19.9 0 0 0
## 4 2022-06-01T03:00 20.8 93 19.7 0 0 0
## 5 2022-06-01T04:00 20.5 93 19.3 0 0 0
## 6 2022-06-01T05:00 19.7 95 19 0.7 0.7 0
## 7 2022-06-01T06:00 19.4 96 18.8 1.6 1.6 0
## 8 2022-06-01T07:00 19.2 96 18.5 1 1 0
## 9 2022-06-01T08:00 18.6 90 17 0.1 0.1 0
## 10 2022-06-01T09:00 17.7 84 14.9 0 0 0
## # … with 8,942 more rows, and abbreviated variable names ¹relativehumidity_2m,
## # ²dewpoint_2m, ³precipitation, ⁴snowfall
##
## $tblUnits
## # A tibble: 11 × 4
## metricType name value description
## <chr> <chr> <chr> <chr>
## 1 hourly_units time iso8601 <NA>
## 2 hourly_units temperature_2m deg C Air temperature at 2 meters above g…
## 3 hourly_units relativehumidity_2m % Relative humidity at 2 meters above…
## 4 hourly_units dewpoint_2m deg C Dew point temperature at 2 meters a…
## 5 hourly_units precipitation mm Total precipitation (rain, showers,…
## 6 hourly_units rain mm Only liquid precipitation of the pr…
## 7 hourly_units snowfall cm Snowfall amount of the preceding ho…
## 8 daily_units time iso8601 <NA>
## 9 daily_units precipitation_sum mm Sum of daily precipitation (includi…
## 10 daily_units rain_sum mm Sum of daily rain
## 11 daily_units snowfall_sum cm Sum of daily snowfall
##
## $tblDescription
## # A tibble: 1 × 7
## latitude longitude generationtime_ms utc_offset_seco…¹ timez…² timez…³ eleva…⁴
## <dbl> <dbl> <dbl> <int> <chr> <chr> <dbl>
## 1 41.8 -87.7 2.66 -18000 Americ… CDT 180
## # … with abbreviated variable names ¹utc_offset_seconds, ²timezone,
## # ³timezone_abbreviation, ⁴elevation
prettyOpenMeteoMeta(tmpOM)
##
## latitude: 41.8
## longitude: -87.7
## generationtime_ms: 2.658963
## utc_offset_seconds: -18000
## timezone: America/Chicago
## timezone_abbreviation: CDT
## elevation: 180
Conversion functions are written for hourly and daily data:
omProcessDaily <- function(tbl, extr="tblDaily") {
if("list" %in% class(tbl)) tbl <- tbl[[extr]]
tbl %>% mutate(date=lubridate::ymd(time)) %>% select(date, everything())
}
omProcessHourly <- function(tbl, extr="tblHourly") {
if("list" %in% class(tbl)) tbl <- tbl[[extr]]
tbl %>%
mutate(origTime=time,
time=lubridate::ymd_hm(time),
date=lubridate::date(time),
hour=lubridate::hour(time)
) %>%
select(time, date, hour, everything())
}
omProcessDaily(tmpOM)
## # A tibble: 373 × 5
## date time precipitation_sum rain_sum snowfall_sum
## <date> <chr> <dbl> <dbl> <dbl>
## 1 2022-06-01 2022-06-01 3.8 3.8 0
## 2 2022-06-02 2022-06-02 0 0 0
## 3 2022-06-03 2022-06-03 0 0 0
## 4 2022-06-04 2022-06-04 1.3 1.3 0
## 5 2022-06-05 2022-06-05 0.3 0.3 0
## 6 2022-06-06 2022-06-06 12.5 12.5 0
## 7 2022-06-07 2022-06-07 2 2 0
## 8 2022-06-08 2022-06-08 16 16 0
## 9 2022-06-09 2022-06-09 0 0 0
## 10 2022-06-10 2022-06-10 0.6 0.6 0
## # … with 363 more rows
omProcessHourly(tmpOM)
## # A tibble: 8,952 × 10
## time date hour temperat…¹ relat…² dewpo…³ preci…⁴ rain
## <dttm> <date> <int> <dbl> <int> <dbl> <dbl> <dbl>
## 1 2022-06-01 00:00:00 2022-06-01 0 21 92 19.7 0.1 0.1
## 2 2022-06-01 01:00:00 2022-06-01 1 20.6 94 19.6 0.3 0.3
## 3 2022-06-01 02:00:00 2022-06-01 2 21.1 93 19.9 0 0
## 4 2022-06-01 03:00:00 2022-06-01 3 20.8 93 19.7 0 0
## 5 2022-06-01 04:00:00 2022-06-01 4 20.5 93 19.3 0 0
## 6 2022-06-01 05:00:00 2022-06-01 5 19.7 95 19 0.7 0.7
## 7 2022-06-01 06:00:00 2022-06-01 6 19.4 96 18.8 1.6 1.6
## 8 2022-06-01 07:00:00 2022-06-01 7 19.2 96 18.5 1 1
## 9 2022-06-01 08:00:00 2022-06-01 8 18.6 90 17 0.1 0.1
## 10 2022-06-01 09:00:00 2022-06-01 9 17.7 84 14.9 0 0
## # … with 8,942 more rows, 2 more variables: snowfall <dbl>, origTime <chr>, and
## # abbreviated variable names ¹temperature_2m, ²relativehumidity_2m,
## # ³dewpoint_2m, ⁴precipitation
Function readOpenMeteoJSON() is updated to automatically incorporate date conversions:
readOpenMeteoJSON <- function(js, mapDaily=tblMetricsDaily, mapHourly=tblMetricsHourly) {
# FUNCTION arguments:
# js: JSON list returned by download from Open-Meteo
# mapDaily: mapping file for daily metrics
# mapHourly: mapping file for hourly metrics
# Get the object and names
jsObj <- jsonlite::read_json(js, simplifyVector = TRUE)
nms <- jsObj %>% names()
cat("\nObjects in JSON include:", paste(nms, collapse=", "), "\n\n")
# Set default objects as NULL
tblDaily <- NULL
tblHourly <- NULL
tblUnitsDaily <- NULL
tblUnitsHourly <- NULL
# Get daily and hourly as tibble if relevant
if("daily" %in% nms) tblDaily <- jsObj$daily %>% tibble::as_tibble() %>% omProcessDaily()
if("hourly" %in% nms) tblHourly <- jsObj$hourly %>% tibble::as_tibble() %>% omProcessHourly()
# Helper function for unit conversions
helperMetricUnit <- function(x, mapper, desc=NULL) {
if(is.null(desc))
desc <- as.list(match.call())$x %>%
deparse() %>%
stringr::str_replace_all(pattern=".*\\$", replacement="")
x %>%
tibble::as_tibble() %>%
pivot_longer(cols=everything()) %>%
left_join(mapper, by=c("name"="metric")) %>%
mutate(value=stringr::str_replace(value, "\u00b0", "deg ")) %>%
mutate(metricType=desc) %>%
select(metricType, everything())
}
# Get the unit descriptions
if("daily_units" %in% nms) tblUnitsDaily <- helperMetricUnit(jsObj$daily_units, mapDaily)
if("hourly_units" %in% nms) tblUnitsHourly <- helperMetricUnit(jsObj$hourly_units, mapHourly)
if(is.null(tblUnitsDaily) & !is.null(tblUnitsHourly)) tblUnits <- tblUnitsHourly
else if(!is.null(tblUnitsDaily) & is.null(tblUnitsHourly)) tblUnits <- tblUnitsDaily
else if(!is.null(tblUnitsDaily) & !is.null(tblUnitsHourly))
tblUnits <- bind_rows(tblUnitsHourly, tblUnitsDaily)
else tblUnits <- NULL
# Put everything else together
tblDescription <- jsObj[setdiff(nms, c("hourly", "hourly_units", "daily", "daily_units"))] %>%
tibble::as_tibble()
# Return the list objects
list(tblDaily=tblDaily, tblHourly=tblHourly, tblUnits=tblUnits, tblDescription=tblDescription)
}
tmpOM2 <- readOpenMeteoJSON("notuse_testOM.json")
##
## Objects in JSON include: latitude, longitude, generationtime_ms, utc_offset_seconds, timezone, timezone_abbreviation, elevation, hourly_units, hourly, daily_units, daily
tmpOM2
## $tblDaily
## # A tibble: 373 × 5
## date time precipitation_sum rain_sum snowfall_sum
## <date> <chr> <dbl> <dbl> <dbl>
## 1 2022-06-01 2022-06-01 3.8 3.8 0
## 2 2022-06-02 2022-06-02 0 0 0
## 3 2022-06-03 2022-06-03 0 0 0
## 4 2022-06-04 2022-06-04 1.3 1.3 0
## 5 2022-06-05 2022-06-05 0.3 0.3 0
## 6 2022-06-06 2022-06-06 12.5 12.5 0
## 7 2022-06-07 2022-06-07 2 2 0
## 8 2022-06-08 2022-06-08 16 16 0
## 9 2022-06-09 2022-06-09 0 0 0
## 10 2022-06-10 2022-06-10 0.6 0.6 0
## # … with 363 more rows
##
## $tblHourly
## # A tibble: 8,952 × 10
## time date hour temperat…¹ relat…² dewpo…³ preci…⁴ rain
## <dttm> <date> <int> <dbl> <int> <dbl> <dbl> <dbl>
## 1 2022-06-01 00:00:00 2022-06-01 0 21 92 19.7 0.1 0.1
## 2 2022-06-01 01:00:00 2022-06-01 1 20.6 94 19.6 0.3 0.3
## 3 2022-06-01 02:00:00 2022-06-01 2 21.1 93 19.9 0 0
## 4 2022-06-01 03:00:00 2022-06-01 3 20.8 93 19.7 0 0
## 5 2022-06-01 04:00:00 2022-06-01 4 20.5 93 19.3 0 0
## 6 2022-06-01 05:00:00 2022-06-01 5 19.7 95 19 0.7 0.7
## 7 2022-06-01 06:00:00 2022-06-01 6 19.4 96 18.8 1.6 1.6
## 8 2022-06-01 07:00:00 2022-06-01 7 19.2 96 18.5 1 1
## 9 2022-06-01 08:00:00 2022-06-01 8 18.6 90 17 0.1 0.1
## 10 2022-06-01 09:00:00 2022-06-01 9 17.7 84 14.9 0 0
## # … with 8,942 more rows, 2 more variables: snowfall <dbl>, origTime <chr>, and
## # abbreviated variable names ¹temperature_2m, ²relativehumidity_2m,
## # ³dewpoint_2m, ⁴precipitation
##
## $tblUnits
## # A tibble: 11 × 4
## metricType name value description
## <chr> <chr> <chr> <chr>
## 1 hourly_units time iso8601 <NA>
## 2 hourly_units temperature_2m deg C Air temperature at 2 meters above g…
## 3 hourly_units relativehumidity_2m % Relative humidity at 2 meters above…
## 4 hourly_units dewpoint_2m deg C Dew point temperature at 2 meters a…
## 5 hourly_units precipitation mm Total precipitation (rain, showers,…
## 6 hourly_units rain mm Only liquid precipitation of the pr…
## 7 hourly_units snowfall cm Snowfall amount of the preceding ho…
## 8 daily_units time iso8601 <NA>
## 9 daily_units precipitation_sum mm Sum of daily precipitation (includi…
## 10 daily_units rain_sum mm Sum of daily rain
## 11 daily_units snowfall_sum cm Sum of daily snowfall
##
## $tblDescription
## # A tibble: 1 × 7
## latitude longitude generationtime_ms utc_offset_seco…¹ timez…² timez…³ eleva…⁴
## <dbl> <dbl> <dbl> <int> <chr> <chr> <dbl>
## 1 41.8 -87.7 2.66 -18000 Americ… CDT 180
## # … with abbreviated variable names ¹utc_offset_seconds, ²timezone,
## # ³timezone_abbreviation, ⁴elevation
prettyOpenMeteoMeta(tmpOM2)
##
## latitude: 41.8
## longitude: -87.7
## generationtime_ms: 2.658963
## utc_offset_seconds: -18000
## timezone: America/Chicago
## timezone_abbreviation: CDT
## elevation: 180
identical(tmpOM$tblUnits, tmpOM2$tblUnits)
## [1] TRUE
identical(tmpOM$tblDescription, tmpOM2$tblDescription)
## [1] TRUE
identical(tmpOM$tblDaily %>% omProcessDaily(), tmpOM2$tblDaily)
## [1] TRUE
identical(tmpOM$tblHourly %>% omProcessHourly(), tmpOM2$tblHourly)
## [1] TRUE
The daily data is tested for file download, cached to avoid multiple hits to the server:
testURLDaily <- helperOpenMeteoURL(cityName="Chicago IL",
dailyIndices=1:nrow(tblMetricsDaily),
startDate="2010-01-01",
endDate="2023-06-15",
tz="America/Chicago"
)
##
## Daily metrics created from indices: weathercode,temperature_2m_max,temperature_2m_min,apparent_temperature_max,apparent_temperature_min,precipitation_sum,rain_sum,snowfall_sum,precipitation_hours,sunrise,sunset,windspeed_10m_max,windgusts_10m_max,winddirection_10m_dominant,shortwave_radiation_sum,et0_fao_evapotranspiration
testURLDaily
## [1] "https://archive-api.open-meteo.com/v1/archive?latitude=41.84&longitude=-87.68&start_date=2010-01-01&end_date=2023-06-15&daily=weathercode,temperature_2m_max,temperature_2m_min,apparent_temperature_max,apparent_temperature_min,precipitation_sum,rain_sum,snowfall_sum,precipitation_hours,sunrise,sunset,windspeed_10m_max,windgusts_10m_max,winddirection_10m_dominant,shortwave_radiation_sum,et0_fao_evapotranspiration&timezone=America%2FChicago"
# Download file
if(!file.exists("notuse_testOM_daily.json")) {
fileDownload(fileName="notuse_testOM_daily.json", url=testURLDaily)
} else {
cat("\nFile notuse_testOM_daily.json already exists, skipping download\n")
}
## size isdir mode mtime
## notuse_testOM_daily.json 573218 FALSE 666 2023-06-23 07:53:04
## ctime atime exe
## notuse_testOM_daily.json 2023-06-23 07:52:59 2023-06-23 07:53:04 no
Data are read and stored as a list:
tmpOMDaily <- readOpenMeteoJSON("notuse_testOM_daily.json")
##
## Objects in JSON include: latitude, longitude, generationtime_ms, utc_offset_seconds, timezone, timezone_abbreviation, elevation, daily_units, daily
tmpOMDaily
## $tblDaily
## # A tibble: 4,914 × 18
## date time weath…¹ tempe…² tempe…³ appar…⁴ appar…⁵ preci…⁶ rain_…⁷
## <date> <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2010-01-01 2010-01-01 3 -8.6 -13.4 -14.9 -19.6 0 0
## 2 2010-01-02 2010-01-02 2 -10.4 -15.1 -17.5 -21.7 0 0
## 3 2010-01-03 2010-01-03 3 -7.9 -13.8 -13.6 -20.1 0 0
## 4 2010-01-04 2010-01-04 3 -6.9 -12.3 -12.8 -18.9 0 0
## 5 2010-01-05 2010-01-05 3 -4.8 -9.8 -10.1 -15.7 0 0
## 6 2010-01-06 2010-01-06 71 -4.9 -9 -9.2 -14.4 0 0
## 7 2010-01-07 2010-01-07 73 -5.2 -8.5 -9.3 -13 7.5 0
## 8 2010-01-08 2010-01-08 73 -3 -9.4 -9.2 -15.3 2.3 0
## 9 2010-01-09 2010-01-09 3 -5.8 -12.3 -10.8 -18.2 0 0
## 10 2010-01-10 2010-01-10 3 -8.8 -19.4 -16.2 -25.6 0 0
## # … with 4,904 more rows, 9 more variables: snowfall_sum <dbl>,
## # precipitation_hours <dbl>, sunrise <chr>, sunset <chr>,
## # windspeed_10m_max <dbl>, windgusts_10m_max <dbl>,
## # winddirection_10m_dominant <int>, shortwave_radiation_sum <dbl>,
## # et0_fao_evapotranspiration <dbl>, and abbreviated variable names
## # ¹weathercode, ²temperature_2m_max, ³temperature_2m_min,
## # ⁴apparent_temperature_max, ⁵apparent_temperature_min, ⁶precipitation_sum, …
##
## $tblHourly
## NULL
##
## $tblUnits
## # A tibble: 17 × 4
## metricType name value description
## <chr> <chr> <chr> <chr>
## 1 daily_units time "iso8601" <NA>
## 2 daily_units weathercode "wmo code" The most severe weather co…
## 3 daily_units temperature_2m_max "deg C" Maximum and minimum daily …
## 4 daily_units temperature_2m_min "deg C" Maximum and minimum daily …
## 5 daily_units apparent_temperature_max "deg C" Maximum and minimum daily …
## 6 daily_units apparent_temperature_min "deg C" Maximum and minimum daily …
## 7 daily_units precipitation_sum "mm" Sum of daily precipitation…
## 8 daily_units rain_sum "mm" Sum of daily rain
## 9 daily_units snowfall_sum "cm" Sum of daily snowfall
## 10 daily_units precipitation_hours "h" The number of hours with r…
## 11 daily_units sunrise "iso8601" Sun rise and set times
## 12 daily_units sunset "iso8601" Sun rise and set times
## 13 daily_units windspeed_10m_max "km/h" Maximum wind speed and gus…
## 14 daily_units windgusts_10m_max "km/h" Maximum wind speed and gus…
## 15 daily_units winddirection_10m_dominant "deg " Dominant wind direction
## 16 daily_units shortwave_radiation_sum "MJ/m²" The sum of solar radiaion …
## 17 daily_units et0_fao_evapotranspiration "mm" Daily sum of ET0 Reference…
##
## $tblDescription
## # A tibble: 1 × 7
## latitude longitude generationtime_ms utc_offset_seco…¹ timez…² timez…³ eleva…⁴
## <dbl> <dbl> <dbl> <int> <chr> <chr> <dbl>
## 1 41.8 -87.7 508. -18000 Americ… CDT 180
## # … with abbreviated variable names ¹utc_offset_seconds, ²timezone,
## # ³timezone_abbreviation, ⁴elevation
prettyOpenMeteoMeta(tmpOMDaily)
##
## latitude: 41.8
## longitude: -87.7
## generationtime_ms: 508.3281
## utc_offset_seconds: -18000
## timezone: America/Chicago
## timezone_abbreviation: CDT
## elevation: 180
# Exploration of precipitation hours by day
tmpOMDaily$tblDaily %>% count(precipitation_hours) %>% print(n=30)
## # A tibble: 25 × 2
## precipitation_hours n
## <dbl> <int>
## 1 0 2499
## 2 1 287
## 3 2 288
## 4 3 231
## 5 4 180
## 6 5 201
## 7 6 152
## 8 7 157
## 9 8 123
## 10 9 121
## 11 10 98
## 12 11 90
## 13 12 74
## 14 13 53
## 15 14 70
## 16 15 41
## 17 16 54
## 18 17 48
## 19 18 36
## 20 19 31
## 21 20 22
## 22 21 11
## 23 22 15
## 24 23 18
## 25 24 14
tmpOMDaily$tblDaily %>%
filter(lubridate::year(date)<=2022) %>%
ggplot(aes(x=precipitation_hours)) +
geom_density(aes(group=lubridate::year(date), color=as.factor(lubridate::year(date)))) +
scale_color_discrete("Year") +
labs(title="Hours of Precipitation per Day", x="Hours of Precipitation", y="Annual density")
tmpOMDaily$tblDaily %>%
filter(lubridate::year(date)<=2022) %>%
ggplot(aes(x=precipitation_hours)) +
geom_histogram(aes(fill=as.factor(lubridate::year(date))), bins=25) +
scale_fill_discrete("Year") +
facet_wrap(~lubridate::year(date)) +
labs(title="Hours of Precipitation per Day", x="Hours of Precipitation", y="# Days")
Precipitation by month is explored:
dfPrecip <- tmpOMDaily$tblDaily %>%
filter(lubridate::year(date)<=2022) %>%
select(date, precipitation_sum, rain_sum, snowfall_sum) %>%
mutate(month=factor(month.abb[lubridate::month(date)], levels=month.abb),
yyyymm=customYYYYMM(date)
) %>%
group_by(yyyymm, month) %>%
summarize(across(where(is.numeric), sum), n=n(), .groups="drop")
dfPrecip
## # A tibble: 156 × 6
## yyyymm month precipitation_sum rain_sum snowfall_sum n
## <chr> <fct> <dbl> <dbl> <dbl> <int>
## 1 2010-01 Jan 25.9 12.6 10.8 31
## 2 2010-02 Feb 36.1 0.1 28.6 28
## 3 2010-03 Mar 58 47.7 7.21 31
## 4 2010-04 Apr 100. 100. 0 30
## 5 2010-05 May 154. 154. 0 31
## 6 2010-06 Jun 226. 226. 0 30
## 7 2010-07 Jul 145. 145. 0 31
## 8 2010-08 Aug 66.4 66.4 0 31
## 9 2010-09 Sep 104. 104. 0 30
## 10 2010-10 Oct 60.7 60.7 0 31
## # … with 146 more rows
# Boxplot of precipitation by month
dfPrecip %>%
select(-n) %>%
pivot_longer(-c(yyyymm, month)) %>%
ggplot(aes(x=month, y=ifelse(name=="snowfall_sum", 10*value, value))) +
geom_boxplot(fill="lightblue") +
facet_wrap(~name, scales="free_y") +
labs(title="Precipitation by month (2010-2022)", y="Precipitation (mm)", x=NULL) +
theme(axis.text.x = element_text(angle = 90)) +
lims(y=c(0, NA))
# Mean precipitation by month
dfPrecip %>%
group_by(month) %>%
summarize(across(where(is.numeric), mean)) %>%
ggplot(aes(x=month)) +
geom_col(aes(y=precipitation_sum), fill="green") +
geom_col(aes(y=rain_sum), fill="lightblue") +
geom_text(aes(y=rain_sum/2, label=round(rain_sum))) +
geom_text(aes(y=rain_sum/2 + precipitation_sum/2,
label=ifelse(precipitation_sum>rain_sum+3, round(precipitation_sum-rain_sum), "")
)
) +
geom_text(aes(y=precipitation_sum+5, label=round(precipitation_sum))) +
labs(x=NULL,
y="Precipitation (mm)",
title="Mean precipitation by month (2010-2022)",
subtitle="Light blue is mm falling as rain, green is liquid equivalent of other"
)
Average temperatures by month are also explored:
dfTemp <- tmpOMDaily$tblDaily %>%
filter(lubridate::year(date)<=2022) %>%
select(date,
temperature_2m_max,
temperature_2m_min,
apparent_temperature_max,
apparent_temperature_min
) %>%
mutate(month=factor(month.abb[lubridate::month(date)], levels=month.abb),
yyyymm=customYYYYMM(date)
) %>%
group_by(yyyymm, month) %>%
summarize(across(where(is.numeric), mean), n=n(), .groups="drop")
dfTemp
## # A tibble: 156 × 7
## yyyymm month temperature_2m_max temperature_2m_min apparent_…¹ appar…² n
## <chr> <fct> <dbl> <dbl> <dbl> <dbl> <int>
## 1 2010-01 Jan -2.46 -8.18 -7.64 -13.7 31
## 2 2010-02 Feb -0.414 -7.31 -4.85 -12.4 28
## 3 2010-03 Mar 7.67 -0.452 4.25 -5.13 31
## 4 2010-04 Apr 16.9 7.40 14.6 3.56 30
## 5 2010-05 May 20.5 13.1 20.5 11.3 31
## 6 2010-06 Jun 26.0 19.1 28.0 19.5 30
## 7 2010-07 Jul 29.1 21.7 32.0 23.5 31
## 8 2010-08 Aug 28.6 21.5 31.2 23.0 31
## 9 2010-09 Sep 22.7 16.0 22.1 14.8 30
## 10 2010-10 Oct 18.1 10.3 15.4 7.14 31
## # … with 146 more rows, and abbreviated variable names
## # ¹apparent_temperature_max, ²apparent_temperature_min
# Boxplot of precipitation by month
dfTemp %>%
select(-n) %>%
pivot_longer(-c(yyyymm, month)) %>%
ggplot(aes(x=month, y=value)) +
geom_boxplot(fill="lightblue") +
facet_wrap(~name) +
labs(title="Average temperature by month (2010-2022)", y="Average temperature (C)", x=NULL) +
theme(axis.text.x = element_text(angle = 90))
# Mean temperatures by month
dfTemp %>%
select(-n) %>%
group_by(month) %>%
summarize(across(where(is.numeric), mean)) %>%
pivot_longer(cols=-c(month)) %>%
mutate(measType=stringr::str_replace(name, ".*_", ""),
meas=ifelse(str_detect(name, "apparent"), "apparent", "actual")
) %>%
select(-name) %>%
pivot_wider(id_cols=c(month, meas), names_from="measType", values_from="value") %>%
ggplot(aes(x=month)) +
geom_tile(aes(y=(max+min)/2, height=max-min), width=0.5, fill="lightblue") +
geom_text(aes(y=max+1, label=round(max, 1)), size=2.5) +
geom_text(aes(y=min-1, label=round(min, 1)), size=2.5) +
labs(x=NULL,
y="Temperature (C)",
title="Mean high and low temperature by month (2010-2022)",
subtitle="Actual temperature and apparent temperature"
) +
facet_wrap(~meas)
Sunrise and sunset times are explored:
dfSun <- tmpOMDaily$tblDaily %>%
filter(lubridate::year(date)<=2022) %>%
select(date, sunrise, sunset) %>%
mutate(month=factor(month.abb[lubridate::month(date)], levels=month.abb),
yyyymm=customYYYYMM(date),
across(c(sunrise, sunset), lubridate::ymd_hm),
sr=hms::as_hms(sunrise),
ss=hms::as_hms(sunset),
doy=lubridate::yday(date),
year=lubridate::year(date)
)
dfSun
## # A tibble: 4,748 × 9
## date sunrise sunset month yyyymm sr ss
## <date> <dttm> <dttm> <fct> <chr> <time> <tim>
## 1 2010-01-01 2010-01-01 08:16:00 2010-01-01 17:32:00 Jan 2010-01 08:16 17:32
## 2 2010-01-02 2010-01-02 08:16:00 2010-01-02 17:33:00 Jan 2010-01 08:16 17:33
## 3 2010-01-03 2010-01-03 08:16:00 2010-01-03 17:34:00 Jan 2010-01 08:16 17:34
## 4 2010-01-04 2010-01-04 08:16:00 2010-01-04 17:34:00 Jan 2010-01 08:16 17:34
## 5 2010-01-05 2010-01-05 08:16:00 2010-01-05 17:35:00 Jan 2010-01 08:16 17:35
## 6 2010-01-06 2010-01-06 08:16:00 2010-01-06 17:36:00 Jan 2010-01 08:16 17:36
## 7 2010-01-07 2010-01-07 08:16:00 2010-01-07 17:37:00 Jan 2010-01 08:16 17:37
## 8 2010-01-08 2010-01-08 08:16:00 2010-01-08 17:38:00 Jan 2010-01 08:16 17:38
## 9 2010-01-09 2010-01-09 08:16:00 2010-01-09 17:39:00 Jan 2010-01 08:16 17:39
## 10 2010-01-10 2010-01-10 08:15:00 2010-01-10 17:40:00 Jan 2010-01 08:15 17:40
## # … with 4,738 more rows, and 2 more variables: doy <dbl>, year <dbl>
# Plot of sunrise and sunset by day of year
dfSun %>%
select(date, month, year, doy, sr, ss) %>%
ggplot(aes(x=doy, group=factor(year), color=factor(year))) +
geom_line(aes(y=sr)) +
geom_line(aes(y=ss)) +
geom_line(aes(y=(ss+sr)/2)) +
labs(x="Day of year", y="Time (always on DST)", title="Sunrise, sunset, and solar noon by day of year") +
scale_color_discrete("Year")
# Plot of minutes gained from earliest/latest
dfSun %>%
select(date, month, year, doy, sr, ss) %>%
group_by(year) %>%
mutate(dsr=max(sr)-sr, dss=ss-min(ss)) %>%
ungroup() %>%
rename(sunrise_change=dsr, sunset_change=dss) %>%
pivot_longer(cols=c(sunrise_change, sunset_change)) %>%
ggplot(aes(x=doy)) +
geom_point(aes(y=as.numeric(value)/60, color=name), size=0.5) +
labs(x="Day of year", y="Minutes", title="Delta from latest sunrise / earliest sunset") +
scale_color_discrete("Metric")
Wind data is explored:
dfWind <- tmpOMDaily$tblDaily %>%
select(date,
dir=winddirection_10m_dominant,
spd=windspeed_10m_max,
gst=windgusts_10m_max
) %>%
mutate(month=lubridate::month(date),
year=lubridate::year(date),
dir10=round(dir/10)*10,
spd5=round(spd/5)*5,
gst5=round(gst/5)*5
)
dfWind
## # A tibble: 4,914 × 9
## date dir spd gst month year dir10 spd5 gst5
## <date> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2010-01-01 291 23.4 38.2 1 2010 290 25 40
## 2 2010-01-02 309 24.3 40.3 1 2010 310 25 40
## 3 2010-01-03 313 21.6 35.6 1 2010 310 20 35
## 4 2010-01-04 304 21 35.3 1 2010 300 20 35
## 5 2010-01-05 298 19.8 33.1 1 2010 300 20 35
## 6 2010-01-06 280 16.4 27 1 2010 280 15 25
## 7 2010-01-07 240 16.3 29.5 1 2010 240 15 30
## 8 2010-01-08 334 27.6 45 1 2010 330 30 45
## 9 2010-01-09 305 17.2 28.1 1 2010 300 15 30
## 10 2010-01-10 226 27.9 46.4 1 2010 230 30 45
## # … with 4,904 more rows
# Plot of wind direction and speed
dfWind %>%
ggplot(aes(x=dir, y=spd)) +
geom_point(alpha=0.2, size=0.5) +
coord_polar() +
facet_wrap(~factor(month.abb[month], levels=month.abb), nrow=2) +
geom_vline(xintercept=c(0, 90, 180, 270), lty=2, color="red") +
labs(title="Maximum wind speed and predominant direction (measured daily)",
y="Maximum Wind speed (km/h)",
x="Predominant Wind direction"
) +
scale_x_continuous(breaks=c(0, 90, 180, 270))
dfWind %>%
filter(lubridate::year(date)<=2022) %>%
count(month, dir10, spd5) %>%
ggplot(aes(x=dir10, y=spd5)) +
geom_point(aes(size=n), alpha=0.2) +
coord_polar() +
facet_wrap(~factor(month.abb[month], levels=month.abb)) +
geom_vline(xintercept=c(0, 90, 180, 270), lty=2, color="red") +
labs(title="Maximum wind speed and predominant direction (measured daily)",
subtitle="Wind speed rounded to nearest 5 km/h, wind direction rounded to nearest 10 degrees",
y="Maximum Wind speed (km/h)",
x="Predominant Wind direction"
) +
scale_x_continuous(breaks=c(0, 90, 180, 270))
# Plot of predominant wind direction
dfWind %>%
ggplot(aes(x=dir10)) +
geom_histogram(binwidth=10) +
facet_wrap(~factor(month.abb[month], levels=month.abb)) +
geom_vline(xintercept=c(0, 90, 180, 270, 360), lty=2, color="red") +
labs(title="Predominant wind direction (measured daily)",
y="# Days",
x="Predominant wind direction (rounded to nearest 10 degrees)"
) +
scale_x_continuous(breaks=c(0, 90, 180, 270, 360))
# Plot of maximum wind speed
dfWind %>%
ggplot(aes(x=spd5)) +
geom_histogram(binwidth=5) +
facet_wrap(~factor(month.abb[month], levels=month.abb)) +
labs(title="Maximum wind speed (measured daily)",
y="# Days",
x="Maximum wind speed (km/h, rounded to nearest 5 km/h)"
)
# Mean maximum wind speed by month
dfWind %>%
filter(year<=2022) %>%
select(date, month, year, spd, gst) %>%
pivot_longer(cols=-c(date, month, year)) %>%
ggplot(aes(x=factor(month.abb[month], levels=month.abb), y=value)) +
geom_boxplot(fill="lightblue") +
facet_wrap(~c("gst"="2. Maximum wind gust", "spd"="1. Maximum wind speed")[name]) +
labs(title="Wind speed measured daily (2010-2022)", y="Wind speed (km/h)", x=NULL) +
theme(axis.text.x = element_text(angle = 90)) +
lims(y=c(0, NA))
Weather codes, radiation, and evapotranspiration are explored:
dfOther <- tmpOMDaily$tblDaily %>%
select(date, wc=weathercode, sw=shortwave_radiation_sum, et=et0_fao_evapotranspiration) %>%
mutate(wc=factor(wc, levels=sort(unique(wc))),
year=lubridate::year(date),
month=factor(month.abb[lubridate::month(date)], levels=month.abb),
yyyymm=customYYYYMM(date)
)
dfOther
## # A tibble: 4,914 × 7
## date wc sw et year month yyyymm
## <date> <fct> <dbl> <dbl> <dbl> <fct> <chr>
## 1 2010-01-01 3 6.94 0.53 2010 Jan 2010-01
## 2 2010-01-02 2 7.91 0.49 2010 Jan 2010-01
## 3 2010-01-03 3 5.62 0.46 2010 Jan 2010-01
## 4 2010-01-04 3 5.09 0.48 2010 Jan 2010-01
## 5 2010-01-05 3 6.61 0.52 2010 Jan 2010-01
## 6 2010-01-06 71 7.47 0.48 2010 Jan 2010-01
## 7 2010-01-07 73 3.82 0.29 2010 Jan 2010-01
## 8 2010-01-08 73 6.47 0.53 2010 Jan 2010-01
## 9 2010-01-09 3 6.22 0.38 2010 Jan 2010-01
## 10 2010-01-10 3 8.99 0.35 2010 Jan 2010-01
## # … with 4,904 more rows
# Histogram of weather code
dfOther %>%
filter(year<=2022) %>%
ggplot(aes(x=wc)) +
geom_bar() +
facet_wrap(~month) +
labs(title="Weather codes by month (2010-2022)", y="Count", x="Weather code") +
theme(axis.text.x = element_text(angle = 90))
# Mean radiation and evapotranspiration by month
dfOther %>%
select(-year) %>%
group_by(month) %>%
summarize(across(where(is.numeric), mean)) %>%
pivot_longer(cols=-c(month)) %>%
ggplot(aes(x=month)) +
geom_point(aes(y=value)) +
geom_line(aes(y=value, group=1)) +
labs(x=NULL,
y=NULL,
title="Mean radiation and evapotranspiration by month (2010-2022)",
subtitle="Evapotranspiration (mm) and Radiation (MegaJoules)"
) +
facet_wrap(~c("et"="Evapotranspiration (mm)", "sw"="Radiation (MJ)")[name], scales="free_y") +
lims(y=c(0, NA))
# Boxplot for radiation and evapotranspiration by month
dfOther %>%
select(date, month, sw, et) %>%
pivot_longer(-c(date, month)) %>%
ggplot(aes(x=month)) +
geom_boxplot(aes(y=value), fill="lightblue") +
labs(x=NULL,
y=NULL,
title="Daily radiation and evapotranspiration (2010-2022)",
subtitle="Evapotranspiration (mm) and Radiation (MegaJoules)"
) +
facet_wrap(~c("et"="Evapotranspiration (mm)", "sw"="Radiation (MJ)")[name], scales="free_y") +
lims(y=c(0, NA))
The hourly data is tested for file download, cached to avoid multiple hits to the server:
testURLHourly <- helperOpenMeteoURL(cityName="Chicago IL",
hourlyIndices=1:nrow(tblMetricsHourly),
startDate="2010-01-01",
endDate="2023-06-15",
tz="America/Chicago"
)
##
## Hourly metrics created from indices: temperature_2m,relativehumidity_2m,dewpoint_2m,apparent_temperature,pressure_msl,surface_pressure,precipitation,rain,snowfall,cloudcover,cloudcover_low,cloudcover_mid,cloudcover_high,shortwave_radiation,direct_radiation,direct_normal_irradiance,diffuse_radiation,windspeed_10m,windspeed_100m,winddirection_10m,winddirection_100m,windgusts_10m,et0_fao_evapotranspiration,weathercode,vapor_pressure_deficit,soil_temperature_0_to_7cm,soil_temperature_7_to_28cm,soil_temperature_28_to_100cm,soil_temperature_100_to_255cm,soil_moisture_0_to_7cm,soil_moisture_7_to_28cm,soil_moisture_28_to_100cm,soil_moisture_100_to_255cm
testURLHourly
## [1] "https://archive-api.open-meteo.com/v1/archive?latitude=41.84&longitude=-87.68&start_date=2010-01-01&end_date=2023-06-15&hourly=temperature_2m,relativehumidity_2m,dewpoint_2m,apparent_temperature,pressure_msl,surface_pressure,precipitation,rain,snowfall,cloudcover,cloudcover_low,cloudcover_mid,cloudcover_high,shortwave_radiation,direct_radiation,direct_normal_irradiance,diffuse_radiation,windspeed_10m,windspeed_100m,winddirection_10m,winddirection_100m,windgusts_10m,et0_fao_evapotranspiration,weathercode,vapor_pressure_deficit,soil_temperature_0_to_7cm,soil_temperature_7_to_28cm,soil_temperature_28_to_100cm,soil_temperature_100_to_255cm,soil_moisture_0_to_7cm,soil_moisture_7_to_28cm,soil_moisture_28_to_100cm,soil_moisture_100_to_255cm&timezone=America%2FChicago"
# Download file
if(!file.exists("notuse_testOM_hourly.json")) {
fileDownload(fileName="notuse_testOM_hourly.json", url=testURLHourly)
} else {
cat("\nFile notuse_testOM_hourly.json already exists, skipping download\n")
}
## size isdir mode mtime
## notuse_testOM_hourly.json 20178300 FALSE 666 2023-06-30 08:03:13
## ctime atime exe
## notuse_testOM_hourly.json 2023-06-30 08:02:50 2023-06-30 08:03:13 no
Data are read and stored as a list:
tmpOMHourly <- readOpenMeteoJSON("notuse_testOM_hourly.json")
##
## Objects in JSON include: latitude, longitude, generationtime_ms, utc_offset_seconds, timezone, timezone_abbreviation, elevation, hourly_units, hourly
tmpOMHourly
## $tblDaily
## NULL
##
## $tblHourly
## # A tibble: 117,936 × 37
## time date hour temper…¹ relat…² dewpo…³ appar…⁴ press…⁵
## <dttm> <date> <int> <dbl> <int> <dbl> <dbl> <dbl>
## 1 2010-01-01 00:00:00 2010-01-01 0 -9.5 67 -14.4 -15.8 1024.
## 2 2010-01-01 01:00:00 2010-01-01 1 -9.8 69 -14.4 -16.3 1025.
## 3 2010-01-01 02:00:00 2010-01-01 2 -10.3 73 -14.2 -16.8 1025.
## 4 2010-01-01 03:00:00 2010-01-01 3 -10.8 74 -14.5 -17.2 1026.
## 5 2010-01-01 04:00:00 2010-01-01 4 -11.3 75 -14.8 -17.7 1026.
## 6 2010-01-01 05:00:00 2010-01-01 5 -11.8 76 -15.1 -18.2 1026.
## 7 2010-01-01 06:00:00 2010-01-01 6 -12.3 77 -15.5 -18.6 1027.
## 8 2010-01-01 07:00:00 2010-01-01 7 -12.8 78 -15.8 -19 1028.
## 9 2010-01-01 08:00:00 2010-01-01 8 -13.2 79 -16.1 -19.4 1028.
## 10 2010-01-01 09:00:00 2010-01-01 9 -13.4 78 -16.3 -19.6 1028.
## # … with 117,926 more rows, 29 more variables: surface_pressure <dbl>,
## # precipitation <dbl>, rain <dbl>, snowfall <dbl>, cloudcover <int>,
## # cloudcover_low <int>, cloudcover_mid <int>, cloudcover_high <int>,
## # shortwave_radiation <dbl>, direct_radiation <dbl>,
## # direct_normal_irradiance <dbl>, diffuse_radiation <dbl>,
## # windspeed_10m <dbl>, windspeed_100m <dbl>, winddirection_10m <int>,
## # winddirection_100m <int>, windgusts_10m <dbl>, …
##
## $tblUnits
## # A tibble: 34 × 4
## metricType name value description
## <chr> <chr> <chr> <chr>
## 1 hourly_units time iso8601 <NA>
## 2 hourly_units temperature_2m deg C Air temperature at 2 meters above …
## 3 hourly_units relativehumidity_2m % Relative humidity at 2 meters abov…
## 4 hourly_units dewpoint_2m deg C Dew point temperature at 2 meters …
## 5 hourly_units apparent_temperature deg C Apparent temperature is the percei…
## 6 hourly_units pressure_msl hPa Atmospheric air pressure reduced t…
## 7 hourly_units surface_pressure hPa Atmospheric air pressure reduced t…
## 8 hourly_units precipitation mm Total precipitation (rain, showers…
## 9 hourly_units rain mm Only liquid precipitation of the p…
## 10 hourly_units snowfall cm Snowfall amount of the preceding h…
## # … with 24 more rows
##
## $tblDescription
## # A tibble: 1 × 7
## latitude longitude generationtime_ms utc_offset_seco…¹ timez…² timez…³ eleva…⁴
## <dbl> <dbl> <dbl> <int> <chr> <chr> <dbl>
## 1 41.8 -87.7 6370. -18000 Americ… CDT 180
## # … with abbreviated variable names ¹utc_offset_seconds, ²timezone,
## # ³timezone_abbreviation, ⁴elevation
prettyOpenMeteoMeta(tmpOMHourly)
##
## latitude: 41.8
## longitude: -87.7
## generationtime_ms: 6369.988
## utc_offset_seconds: -18000
## timezone: America/Chicago
## timezone_abbreviation: CDT
## elevation: 180
Consistency of data between daily and hourly is explored:
# Variables where maximum of hourly should be created
vrblMax <- c("weathercode", "temperature_2m", "apparent_temperature", "windspeed_10m", "windgusts_10m")
# Variables where minimum of hourly should be created
vrblMin <- c("temperature_2m", "apparent_temperature")
# Variables where sum of hourly should be created
vrblSum <- c("precipitation", "rain", "snowfall", "shortwave_radiation", "et0_fao_evapotranspiration")
# Variables in daily not to explore
# date, time, sunrise, sunset
# Variables that require a different approach
# winddirection_10m_dominant, precipitation_hours
# Check that all variables are included in hourly data
c(vrblMax, vrblMin, vrblSum) %in% (tmpOMHourly$tblHourly %>% names)
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
# Create daily data from hourly
dfDailyFromHourly <- tmpOMHourly$tblHourly %>%
group_by(date) %>%
summarize(across(.cols=all_of(vrblMax), .fns=max, .names="{.col}_max"),
across(.cols=all_of(vrblMin), .fns=min, .names="{.col}_min"),
across(.cols=all_of(vrblSum), .fns=sum, .names="{.col}_sum"),
precipitation_hours=sum(precipitation>0)
) %>%
rename(weathercode=weathercode_max, et0_fao_evapotranspiration=et0_fao_evapotranspiration_sum)
dfDailyFromHourly
## # A tibble: 4,914 × 14
## date weatherc…¹ tempe…² appar…³ winds…⁴ windg…⁵ tempe…⁶ appar…⁷ preci…⁸
## <date> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2010-01-01 3 -8.6 -14.9 23.4 38.2 -13.4 -19.6 0
## 2 2010-01-02 2 -10.4 -17.5 24.3 40.3 -15.1 -21.7 0
## 3 2010-01-03 3 -7.9 -13.6 21.6 35.6 -13.8 -20.1 0
## 4 2010-01-04 3 -6.9 -12.8 21 35.3 -12.3 -18.9 0
## 5 2010-01-05 3 -4.8 -10.1 19.8 33.1 -9.8 -15.7 0
## 6 2010-01-06 71 -4.9 -9.2 16.4 27 -9 -14.4 0
## 7 2010-01-07 73 -5.2 -9.3 16.3 29.5 -8.5 -13 7.5
## 8 2010-01-08 73 -3 -9.2 27.6 45 -9.4 -15.3 2.3
## 9 2010-01-09 3 -5.8 -10.8 17.2 28.1 -12.3 -18.2 0
## 10 2010-01-10 3 -8.8 -16.2 27.9 46.4 -19.4 -25.6 0
## # … with 4,904 more rows, 5 more variables: rain_sum <dbl>, snowfall_sum <dbl>,
## # shortwave_radiation_sum <dbl>, et0_fao_evapotranspiration <dbl>,
## # precipitation_hours <int>, and abbreviated variable names ¹weathercode,
## # ²temperature_2m_max, ³apparent_temperature_max, ⁴windspeed_10m_max,
## # ⁵windgusts_10m_max, ⁶temperature_2m_min, ⁷apparent_temperature_min,
## # ⁸precipitation_sum
names(dfDailyFromHourly)
## [1] "date" "weathercode"
## [3] "temperature_2m_max" "apparent_temperature_max"
## [5] "windspeed_10m_max" "windgusts_10m_max"
## [7] "temperature_2m_min" "apparent_temperature_min"
## [9] "precipitation_sum" "rain_sum"
## [11] "snowfall_sum" "shortwave_radiation_sum"
## [13] "et0_fao_evapotranspiration" "precipitation_hours"
names(dfDailyFromHourly) %in% names(tmpOMDaily$tblDaily)
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
# Check data consistency
for (colName in names(dfDailyFromHourly)) {
cat("\n",
colName,
":",
all.equal(dfDailyFromHourly %>% pull(colName), tmpOMDaily$tblDaily %>% pull(colName))
)
}
##
## date : TRUE
## weathercode : TRUE
## temperature_2m_max : TRUE
## apparent_temperature_max : TRUE
## windspeed_10m_max : TRUE
## windgusts_10m_max : TRUE
## temperature_2m_min : TRUE
## apparent_temperature_min : TRUE
## precipitation_sum : TRUE
## rain_sum : TRUE
## snowfall_sum : TRUE
## shortwave_radiation_sum : Mean relative difference: 0.9964
## et0_fao_evapotranspiration : TRUE
## precipitation_hours : TRUE
# Plot for differences in radiation
dfRadiation <- dfDailyFromHourly %>%
select(date, shortwave_radiation_sum) %>%
bind_rows(select(tmpOMDaily$tblDaily, date, shortwave_radiation_sum), .id="src") %>%
mutate(src=c("1"="Daily from Hourly", "2"="Daily as Reported")[src])
dfRadiation %>%
ggplot(aes(x=date, y=shortwave_radiation_sum)) +
geom_line(aes(group=src, color=src)) +
labs(x=NULL, y="Sum of radiation", title="Comparison of shortwave radiation by day by source")
# Exploration of units
tmpOMDaily$tblUnits %>% filter(name=="shortwave_radiation_sum")
## # A tibble: 1 × 4
## metricType name value description
## <chr> <chr> <chr> <chr>
## 1 daily_units shortwave_radiation_sum MJ/m² The sum of solar radiaion on a give…
tmpOMHourly$tblUnits %>% filter(name=="shortwave_radiation")
## # A tibble: 1 × 4
## metricType name value description
## <chr> <chr> <chr> <chr>
## 1 hourly_units shortwave_radiation W/m² Shortwave solar radiation as average o…
# Conversion of Watts per hour to MegaJoules
# 0.0036 megajoules/watt-hour
dfRadiation %>%
ggplot(aes(x=date, y=ifelse(src=="Daily from Hourly", 0.0036, 1)*shortwave_radiation_sum)) +
geom_line(aes(group=src, color=src)) +
labs(x=NULL,
y="Sum of radiation",
title="Comparison of shortwave radiation by day by source",
subtitle="Summed from hourly multiplied by 0.0036 to convert Watt-hours to MegaJoules"
)
dfRadiation %>%
pivot_wider(id_cols="date", names_from="src", values_from="shortwave_radiation_sum") %>%
mutate(rat=`Daily as Reported`/`Daily from Hourly`) %>%
summary()
## date Daily from Hourly Daily as Reported rat
## Min. :2010-01-01 Min. : 135 Min. : 0.49 Min. :0.003585
## 1st Qu.:2013-05-13 1st Qu.:2304 1st Qu.: 8.29 1st Qu.:0.003599
## Median :2016-09-22 Median :4062 Median :14.62 Median :0.003600
## Mean :2016-09-22 Mean :4190 Mean :15.08 Mean :0.003600
## 3rd Qu.:2020-02-02 3rd Qu.:6056 3rd Qu.:21.80 3rd Qu.:0.003601
## Max. :2023-06-15 Max. :8788 Max. :31.64 Max. :0.003630
With the exception of radiation (reported in different units causing slight rounding differences), the reported daily data matches the expected aggregate of the reported hourly data
Precipitation by hour is explored:
# Hourly precipitation data
dfHourlyPrecip <- tmpOMHourly$tblHourly %>%
select(time, hour, precipitation, snowfall, rain) %>%
mutate(year=lubridate::year(time),
month=factor(month.abb[lubridate::month(time)], levels=month.abb)
) %>%
pivot_longer(cols=-c(time, year, month, hour))
dfHourlyPrecip
## # A tibble: 353,808 × 6
## time hour year month name value
## <dttm> <int> <dbl> <fct> <chr> <dbl>
## 1 2010-01-01 00:00:00 0 2010 Jan precipitation 0
## 2 2010-01-01 00:00:00 0 2010 Jan snowfall 0
## 3 2010-01-01 00:00:00 0 2010 Jan rain 0
## 4 2010-01-01 01:00:00 1 2010 Jan precipitation 0
## 5 2010-01-01 01:00:00 1 2010 Jan snowfall 0
## 6 2010-01-01 01:00:00 1 2010 Jan rain 0
## 7 2010-01-01 02:00:00 2 2010 Jan precipitation 0
## 8 2010-01-01 02:00:00 2 2010 Jan snowfall 0
## 9 2010-01-01 02:00:00 2 2010 Jan rain 0
## 10 2010-01-01 03:00:00 3 2010 Jan precipitation 0
## # … with 353,798 more rows
# Nil precipitation percent
dfNilPrecip <- dfHourlyPrecip %>%
group_by(month, name) %>%
summarize(pctNil=mean(value==0), .groups="drop")
dfNilPrecip
## # A tibble: 36 × 3
## month name pctNil
## <fct> <chr> <dbl>
## 1 Jan precipitation 0.845
## 2 Jan rain 0.925
## 3 Jan snowfall 0.877
## 4 Feb precipitation 0.845
## 5 Feb rain 0.938
## 6 Feb snowfall 0.865
## 7 Mar precipitation 0.851
## 8 Mar rain 0.884
## 9 Mar snowfall 0.946
## 10 Apr precipitation 0.807
## # … with 26 more rows
# Graphs of precipitation amount by month
for(metric in unique(dfHourlyPrecip$name)) {
p1 <- dfHourlyPrecip %>%
filter(name==metric, value>0, year<=2022) %>%
ggplot() +
geom_histogram(aes(x=value), bins = 50) +
facet_wrap(~month) +
labs(x=NULL,
y=NULL,
title=paste0(metric,
": hourly total (",
tmpOMHourly$tblUnits %>% filter(name==metric) %>% pull(value),
") from 2010-2022"
)
) +
geom_text(data=dfNilPrecip %>% filter(name==metric),
aes(x=Inf,
y=Inf,
label=paste0("Excludes ", round(100*pctNil, 1), "%\nof observations at 0")
),
size=2.5,
hjust=1,
vjust=1
)
print(p1)
}
Temperature by hour is explored:
# Hourly temperature data
dfHourlyTemp <- tmpOMHourly$tblHourly %>%
select(time, hour, temperature_2m, apparent_temperature, dewpoint_2m) %>%
mutate(year=lubridate::year(time),
month=factor(month.abb[lubridate::month(time)], levels=month.abb)
) %>%
pivot_longer(cols=-c(time, year, month, hour))
dfHourlyTemp
## # A tibble: 353,808 × 6
## time hour year month name value
## <dttm> <int> <dbl> <fct> <chr> <dbl>
## 1 2010-01-01 00:00:00 0 2010 Jan temperature_2m -9.5
## 2 2010-01-01 00:00:00 0 2010 Jan apparent_temperature -15.8
## 3 2010-01-01 00:00:00 0 2010 Jan dewpoint_2m -14.4
## 4 2010-01-01 01:00:00 1 2010 Jan temperature_2m -9.8
## 5 2010-01-01 01:00:00 1 2010 Jan apparent_temperature -16.3
## 6 2010-01-01 01:00:00 1 2010 Jan dewpoint_2m -14.4
## 7 2010-01-01 02:00:00 2 2010 Jan temperature_2m -10.3
## 8 2010-01-01 02:00:00 2 2010 Jan apparent_temperature -16.8
## 9 2010-01-01 02:00:00 2 2010 Jan dewpoint_2m -14.2
## 10 2010-01-01 03:00:00 3 2010 Jan temperature_2m -10.8
## # … with 353,798 more rows
# Graphs of precipitation amount by month
for(metric in unique(dfHourlyTemp$name)) {
p1 <- dfHourlyTemp %>%
filter(name==metric, year<=2022) %>%
ggplot() +
geom_boxplot(aes(x=factor(hour), y=value), fill = "lightblue") +
facet_wrap(~month) +
labs(x=NULL,
y=NULL,
title=paste0(metric,
": hourly boxplot (",
tmpOMHourly$tblUnits %>% filter(name==metric) %>% pull(value),
") from 2010-2022"
)
)
print(p1)
}
# Spread of temperature by day
dfHourlyTemp %>%
mutate(date=lubridate::date(time)) %>%
group_by(year, month, date, name) %>%
summarize(maxValue=max(value), minValue=min(value), mdnValue=median(value), .groups="drop") %>%
mutate(spd=maxValue-minValue) %>%
group_by(month, name) %>%
summarize(across(where(is.numeric), mean), .groups="drop") %>%
ggplot(aes(x=fct_rev(month), y=spd)) +
geom_point() +
coord_flip() +
facet_wrap(~name) +
labs(title="Average high/low spread of key metrics by month (deg C)", x=NULL, y="deg C") +
lims(y=c(0, NA))
Hours with maximum/minimum temperature and precipitation are explored:
# Create temperature and precipitation data
dfHourlyTempPrecip <- dfHourlyTemp %>%
bind_rows(dfHourlyPrecip) %>%
mutate(date=lubridate::date(time)) %>%
arrange(time, name)
dfHourlyTempPrecip
## # A tibble: 707,616 × 7
## time hour year month name value date
## <dttm> <int> <dbl> <fct> <chr> <dbl> <date>
## 1 2010-01-01 00:00:00 0 2010 Jan apparent_temperature -15.8 2010-01-01
## 2 2010-01-01 00:00:00 0 2010 Jan dewpoint_2m -14.4 2010-01-01
## 3 2010-01-01 00:00:00 0 2010 Jan precipitation 0 2010-01-01
## 4 2010-01-01 00:00:00 0 2010 Jan rain 0 2010-01-01
## 5 2010-01-01 00:00:00 0 2010 Jan snowfall 0 2010-01-01
## 6 2010-01-01 00:00:00 0 2010 Jan temperature_2m -9.5 2010-01-01
## 7 2010-01-01 01:00:00 1 2010 Jan apparent_temperature -16.3 2010-01-01
## 8 2010-01-01 01:00:00 1 2010 Jan dewpoint_2m -14.4 2010-01-01
## 9 2010-01-01 01:00:00 1 2010 Jan precipitation 0 2010-01-01
## 10 2010-01-01 01:00:00 1 2010 Jan rain 0 2010-01-01
## # … with 707,606 more rows
# Limit to temperature, dewpoint, and precipitation
# Limit precipitation to only days with precipitation > 0
tmpDF <- dfHourlyTempPrecip %>%
filter(name %in% c("dewpoint_2m", "precipitation", "temperature_2m")) %>%
group_by(date, name) %>%
filter(name!="precipitation" | sum(value)>0) %>%
mutate(isMax=ifelse(value==max(value), 1, 0), isMin=ifelse(value==min(value), 1, 0)) %>%
group_by(name, month, hour) %>%
summarize(across(c(isMax, isMin), mean), .groups="drop") %>%
pivot_longer(-c(name, month, hour), names_to="metric")
tmpDF
## # A tibble: 1,728 × 5
## name month hour metric value
## <chr> <fct> <int> <chr> <dbl>
## 1 dewpoint_2m Jan 0 isMax 0.212
## 2 dewpoint_2m Jan 0 isMin 0.196
## 3 dewpoint_2m Jan 1 isMax 0.0507
## 4 dewpoint_2m Jan 1 isMin 0.0968
## 5 dewpoint_2m Jan 2 isMax 0.0599
## 6 dewpoint_2m Jan 2 isMin 0.0484
## 7 dewpoint_2m Jan 3 isMax 0.0346
## 8 dewpoint_2m Jan 3 isMin 0.0253
## 9 dewpoint_2m Jan 4 isMax 0.0184
## 10 dewpoint_2m Jan 4 isMin 0.0507
## # … with 1,718 more rows
# Plot max/min for metric
for(keyMetric in unique(tmpDF$name)) {
p1 <- tmpDF %>%
filter(name==keyMetric) %>%
ggplot(aes(x=hour, y=value)) +
geom_line(aes(color=metric, group=metric)) +
facet_wrap(~month) +
labs(x="Hour of day",
y="% of time as max/min",
title=paste0(keyMetric, ": maximum and minimum by hour"),
subtitle=paste0("Ties included as full value",
ifelse(keyMetric=="precipitation", " (days with no precipitation excluded)", "")
)
) +
scale_color_discrete("Metric:")
print(p1)
}
# Plot percent of hours with precipitation
dfHourlyTempPrecip %>%
filter(name=="precipitation") %>%
group_by(month, hour) %>%
summarize(pct0=mean(value>0), pct05=mean(value>=0.5), .groups="drop") %>%
pivot_longer(-c(month, hour)) %>%
mutate(name=ifelse(name=="pct0", ">=0.1 mm", ">=0.5 mm")) %>%
ggplot(aes(x=hour, y=value)) +
geom_line(aes(group=name, color=name)) +
facet_wrap(~month) +
labs(x="Hour of day",
y="% at/above precipitation hurdle",
title=paste0("% of observations with precipitation in past hour")
) +
lims(y=c(0, NA))
Wind by hour is explored:
# Hourly wind data
dfHourlyWind <- tmpOMHourly$tblHourly %>%
select(time, hour, windspeed_10m, windgusts_10m, winddirection_10m) %>%
mutate(year=lubridate::year(time),
month=factor(month.abb[lubridate::month(time)], levels=month.abb)
) %>%
pivot_longer(cols=-c(time, year, month, hour))
dfHourlyWind
## # A tibble: 353,808 × 6
## time hour year month name value
## <dttm> <int> <dbl> <fct> <chr> <dbl>
## 1 2010-01-01 00:00:00 0 2010 Jan windspeed_10m 18.7
## 2 2010-01-01 00:00:00 0 2010 Jan windgusts_10m 33.8
## 3 2010-01-01 00:00:00 0 2010 Jan winddirection_10m 298
## 4 2010-01-01 01:00:00 1 2010 Jan windspeed_10m 20.1
## 5 2010-01-01 01:00:00 1 2010 Jan windgusts_10m 32.4
## 6 2010-01-01 01:00:00 1 2010 Jan winddirection_10m 291
## 7 2010-01-01 02:00:00 2 2010 Jan windspeed_10m 19.9
## 8 2010-01-01 02:00:00 2 2010 Jan windgusts_10m 34.2
## 9 2010-01-01 02:00:00 2 2010 Jan winddirection_10m 290
## 10 2010-01-01 03:00:00 3 2010 Jan windspeed_10m 19.5
## # … with 353,798 more rows
# Graphs of wind speed/gust by month
for(metric in setdiff(unique(dfHourlyWind$name), "winddirection_10m")) {
p1 <- dfHourlyWind %>%
filter(name==metric, year<=2022) %>%
ggplot() +
geom_boxplot(aes(x=factor(hour), y=value), fill = "lightblue") +
facet_wrap(~month) +
labs(x=NULL,
y=NULL,
title=paste0(metric,
": hourly boxplot (",
tmpOMHourly$tblUnits %>% filter(name==metric) %>% pull(value),
") from 2010-2022"
)
)
print(p1)
}
# Average wind speed and gust by hour
dfHourlyWind %>%
filter(!(name %in% c("winddirection_10m")), year<=2022) %>%
group_by(month, hour, name) %>%
summarize(across("value", mean), .groups="drop") %>%
ggplot(aes(x=factor(hour), y=value)) +
geom_point(aes(group=name, color=name)) +
facet_wrap(~month) +
labs(title="Average wind speed and gust (km/h)", x=NULL, y="km/h") +
lims(y=c(0, NA))
# Average wind direction by hour
dfHourlyWind %>%
filter((name %in% c("winddirection_10m")), year<=2022) %>%
mutate(preDom=case_when(value<45|value>=315~"N",
value<135~"E",
value<225~"S",
value<315~"W",
TRUE~"error"
)
) %>%
count(month, hour, preDom) %>%
ggplot(aes(x=factor(hour), y=n)) +
geom_col(aes(fill=factor(preDom, levels=c("N", "W", "S", "E"))), position="fill") +
facet_wrap(~month) +
labs(title="Distribution of wind direction", x=NULL, y="Wind direction (%)") +
scale_fill_discrete("")
Wind by N/S and E/W is also explored:
# Average wind direction by hour
tmpWindDir <- dfHourlyWind %>%
filter((name %in% c("winddirection_10m")), year<=2022) %>%
mutate(ew=case_when(value>30&value<150~"E",
value>210&value<=330~"W",
TRUE~"none"
),
ns=case_when(value>300|value<=60~"N",
value>120&value<=240~"S",
TRUE~"none"
)
)
tmpWindDir
## # A tibble: 113,952 × 8
## time hour year month name value ew ns
## <dttm> <int> <dbl> <fct> <chr> <dbl> <chr> <chr>
## 1 2010-01-01 00:00:00 0 2010 Jan winddirection_10m 298 W none
## 2 2010-01-01 01:00:00 1 2010 Jan winddirection_10m 291 W none
## 3 2010-01-01 02:00:00 2 2010 Jan winddirection_10m 290 W none
## 4 2010-01-01 03:00:00 3 2010 Jan winddirection_10m 289 W none
## 5 2010-01-01 04:00:00 4 2010 Jan winddirection_10m 289 W none
## 6 2010-01-01 05:00:00 5 2010 Jan winddirection_10m 288 W none
## 7 2010-01-01 06:00:00 6 2010 Jan winddirection_10m 287 W none
## 8 2010-01-01 07:00:00 7 2010 Jan winddirection_10m 286 W none
## 9 2010-01-01 08:00:00 8 2010 Jan winddirection_10m 285 W none
## 10 2010-01-01 09:00:00 9 2010 Jan winddirection_10m 282 W none
## # … with 113,942 more rows
tmpWindDir %>%
count(month, hour, ew) %>%
group_by(month, hour) %>%
mutate(pct=n/sum(n)) %>%
ungroup() %>%
ggplot(aes(x=hour, y=pct)) +
geom_line(aes(color=factor(ew, levels=c("W", "none", "E")))) +
facet_wrap(~month) +
labs(title="Wind direction",
x="Hour of day",
y="% of observations",
subtitle="(030-150 deg defined as East, 210-330 deg defined as West)"
) +
scale_color_discrete("") +
lims(y=c(0, NA))
tmpWindDir %>%
count(month, hour, ns) %>%
group_by(month, hour) %>%
mutate(pct=n/sum(n)) %>%
ungroup() %>%
ggplot(aes(x=hour, y=pct)) +
geom_line(aes(color=factor(ns, levels=c("S", "none", "N")))) +
facet_wrap(~month) +
labs(title="Wind direction",
x="Hour of day",
y="% of observations",
subtitle="(300-060 deg defined as North, 120-240 deg defined as South)"
) +
scale_color_discrete("") +
lims(y=c(0, NA))
Hourly wind directions are averaged using arctan. The formula is arctan2(y=sum-of-sin, x=sum-of-cos):
# Unweighted by wind speed
tmpWindatan_uw <- tmpWindDir %>%
mutate(date=lubridate::date(time), cosine=cos(2*pi*value/360), sine=sin(2*pi*value/360)) %>%
group_by(date) %>%
summarize(across(c(cosine, sine), sum), .groups="drop") %>%
mutate(arctangent=atan(sine/cosine),
arctangent2=atan2(y=sine, x=cosine),
avgdir=((arctangent2/2)*(360/pi)),
avgdir=ifelse(avgdir<0, 360+avgdir, avgdir)
) %>%
left_join(select(tmpOMDaily$tblDaily, date, wdd=winddirection_10m_dominant), by="date")
tmpWindatan_uw
## # A tibble: 4,748 × 7
## date cosine sine arctangent arctangent2 avgdir wdd
## <date> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 2010-01-01 8.00 -22.2 -1.22 -1.22 290. 291
## 2 2010-01-02 14.8 -18.2 -0.888 -0.888 309. 309
## 3 2010-01-03 16.2 -17.3 -0.817 -0.817 313. 313
## 4 2010-01-04 13.4 -19.8 -0.976 -0.976 304. 304
## 5 2010-01-05 10.9 -21.3 -1.10 -1.10 297. 298
## 6 2010-01-06 1.50 -22.4 -1.50 -1.50 274. 280
## 7 2010-01-07 -9.62 -6.35 0.583 -2.56 213. 240
## 8 2010-01-08 19.6 -11.1 -0.516 -0.516 330. 334
## 9 2010-01-09 13.5 -19.3 -0.962 -0.962 305. 305
## 10 2010-01-10 -13.7 -17.3 0.900 -2.24 232. 226
## # … with 4,738 more rows
tmpWindatan_uw %>%
mutate(delta=avgdir-wdd) %>%
summary()
## date cosine sine arctangent
## Min. :2010-01-01 Min. :-23.894 Min. :-23.930 Min. :-1.5699
## 1st Qu.:2013-04-01 1st Qu.:-14.778 1st Qu.:-14.142 1st Qu.:-0.5057
## Median :2016-07-01 Median : -2.812 Median : -3.161 Median : 0.2924
## Mean :2016-07-01 Mean : -1.955 Mean : -2.518 Mean : 0.1595
## 3rd Qu.:2019-10-01 3rd Qu.: 10.799 3rd Qu.: 8.453 3rd Qu.: 0.8423
## Max. :2022-12-31 Max. : 23.900 Max. : 23.627 Max. : 1.5704
## arctangent2 avgdir wdd delta
## Min. :-3.1408 Min. : 0.1209 Min. : 0.0 Min. :-356.6866
## 1st Qu.:-2.1208 1st Qu.: 92.7324 1st Qu.: 93.0 1st Qu.: -2.5483
## Median :-0.6638 Median :196.5612 Median :198.5 Median : -0.0048
## Mean :-0.4307 Mean :180.2702 Mean :181.2 Mean : -0.9128
## 3rd Qu.: 0.9929 3rd Qu.:255.9577 3rd Qu.:257.0 3rd Qu.: 2.5971
## Max. : 3.1409 Max. :359.9807 Max. :360.0 Max. : 358.5385
tmpWindatan_uw %>%
count(wdd, awdd=round(avgdir)) %>%
ggplot(aes(x=wdd, y=awdd)) +
geom_point(aes(size=n)) +
labs(x="Reported dominant wind direction (daily data)",
y="Calculated dominant wind direction (hourly data)",
title="Relationship between reported and calculated dominant wind direction",
subtitle="Unweighted by wind speed"
) +
scale_size_continuous("# days")
# Weighted by wind speed
tmpWindatan_wtd <- tmpOMHourly$tblHourly %>%
select(date, time, wd=winddirection_10m, ws=windspeed_10m) %>%
mutate(cosine=cos(2*pi*wd/360), sine=sin(2*pi*wd/360)) %>%
group_by(date) %>%
summarize(across(c(cosine, sine), .fns=function(x) sum(x*ws)/sum(ws)), sws=sum(ws), .groups="drop") %>%
mutate(arctangent=atan(sine/cosine),
arctangent2=atan2(y=sine, x=cosine),
avgdir=((arctangent2/2)*(360/pi)),
avgdir=ifelse(avgdir<0, 360+avgdir, avgdir),
avgspd=sws/24,
dist=sws*sqrt(cosine**2+sine**2)
) %>%
left_join(select(tmpOMDaily$tblDaily, date, wdd=winddirection_10m_dominant), by="date")
tmpWindatan_wtd
## # A tibble: 4,914 × 10
## date cosine sine sws arctangent arctang…¹ avgdir avgspd dist wdd
## <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 2010-01-01 0.345 -0.917 463 -1.21 -1.21 291. 19.3 454. 291
## 2 2010-01-02 0.620 -0.757 478. -0.885 -0.885 309. 19.9 468. 309
## 3 2010-01-03 0.675 -0.722 447. -0.819 -0.819 313. 18.6 441. 313
## 4 2010-01-04 0.561 -0.823 461. -0.973 -0.973 304. 19.2 459. 304
## 5 2010-01-05 0.461 -0.884 392. -1.09 -1.09 298. 16.3 391. 298
## 6 2010-01-06 0.169 -0.945 251. -1.39 -1.39 280. 10.5 241. 280
## 7 2010-01-07 -0.250 -0.426 210 1.04 -2.10 240. 8.75 104. 240
## 8 2010-01-08 0.857 -0.418 471. -0.453 -0.453 334. 19.6 449. 334
## 9 2010-01-09 0.569 -0.798 359. -0.951 -0.951 306. 14.9 351. 305
## 10 2010-01-10 -0.643 -0.672 457. 0.808 -2.33 226. 19.0 425. 226
## # … with 4,904 more rows, and abbreviated variable name ¹arctangent2
tmpWindatan_wtd %>%
mutate(delta=avgdir-wdd) %>%
summary()
## date cosine sine sws
## Min. :2010-01-01 Min. :-0.99564 Min. :-0.9971 Min. : 85.3
## 1st Qu.:2013-05-13 1st Qu.:-0.64915 1st Qu.:-0.6081 1st Qu.:247.4
## Median :2016-09-22 Median :-0.11700 Median :-0.1535 Median :336.6
## Mean :2016-09-22 Mean :-0.07922 Mean :-0.1086 Mean :354.5
## 3rd Qu.:2020-02-02 3rd Qu.: 0.47777 3rd Qu.: 0.3654 3rd Qu.:436.6
## Max. :2023-06-15 Max. : 0.99606 Max. : 0.9855 Max. :933.4
## arctangent arctangent2 avgdir avgspd
## Min. :-1.5707 Min. :-3.1403 Min. : 0.015 Min. : 3.554
## 1st Qu.:-0.4878 1st Qu.:-2.1529 1st Qu.: 91.028 1st Qu.:10.309
## Median : 0.3097 Median :-0.7181 Median :198.011 Median :14.023
## Mean : 0.1642 Mean :-0.4630 Mean :180.505 Mean :14.770
## 3rd Qu.: 0.8461 3rd Qu.: 0.9509 3rd Qu.:257.184 3rd Qu.:18.191
## Max. : 1.5705 Max. : 3.1402 Max. :359.957 Max. :38.892
## dist wdd delta
## Min. : 3.001 Min. : 0.0 Min. :-359.9850
## 1st Qu.:186.211 1st Qu.: 91.0 1st Qu.: -0.2522
## Median :285.688 Median :198.0 Median : 0.0037
## Mean :301.524 Mean :180.7 Mean : -0.1459
## 3rd Qu.:399.462 3rd Qu.:257.0 3rd Qu.: 0.2536
## Max. :920.029 Max. :360.0 Max. : 1.5144
tmpWindatan_wtd %>%
count(wdd, awdd=round(avgdir)) %>%
ggplot(aes(x=wdd, y=awdd)) +
geom_point(aes(size=n)) +
labs(x="Reported dominant wind direction (daily data)",
y="Calculated dominant wind direction (hourly data)",
title="Relationship between reported and calculated dominant wind direction",
subtitle="Weighted by wind speed"
) +
scale_size_continuous("# days")
tmpWindatan_wtd %>%
count(rdist=round(dist), rspd=round(avgspd)) %>%
ggplot(aes(x=rspd, y=rdist/24)) +
geom_point(aes(size=n)) +
labs(title="Average wind speed (total and weighted by direction) by day",
x="Average wind speed per day",
y="Weighted average wind speed\n(total distance on average angle, divided by 24)"
) +
geom_abline(slope=1, intercept=0, lty=2) +
scale_size_continuous("# days")
tmpWindatan_wtd %>%
mutate(rdist=round(dist), rspd=round(avgspd)) %>%
ggplot(aes(x=rspd)) +
geom_boxplot(aes(y=dist/24/rspd, group=rspd), fill="lightblue") +
labs(title="Average wind speed (total and weighted by direction) by day",
x="Average wind speed per day",
y="Average weighted wind speed\n(as ratio of gross average)"
)
tmpWindatan_wtd %>%
filter(abs(avgdir-wdd)>1)
## # A tibble: 7 × 10
## date cosine sine sws arctangent arctang…¹ avgdir avgspd dist
## <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2011-12-02 0.174 0.0000455 288. 0.000262 0.000262 1.50e-2 12.0 50.0
## 2 2013-07-31 0.0206 0.0414 182. 1.11 1.11 6.35e+1 7.58 8.41
## 3 2016-04-22 0.918 0.00197 406. 0.00215 0.00215 1.23e-1 16.9 373.
## 4 2017-07-31 -0.0306 -0.0569 86.6 1.08 -2.06 2.42e+2 3.61 5.59
## 5 2019-07-17 0.0352 0.0545 139. 0.998 0.998 5.72e+1 5.80 9.03
## 6 2020-11-22 -0.108 0.0523 228. -0.452 2.69 1.54e+2 9.48 27.3
## 7 2023-03-01 0.0482 0.0180 251. 0.358 0.358 2.05e+1 10.5 12.9
## # … with 1 more variable: wdd <int>, and abbreviated variable name ¹arctangent2
The wind direction averaging of hourly data, weighted by wind speed, is consistent with the reported dominant wind direction in the daily data.
Weather codes are explored:
# Hourly weather codes, evapotranspiration, and shortwave
dfHourlyCode <- tmpOMHourly$tblHourly %>%
select(time, hour, wc=weathercode, et=et0_fao_evapotranspiration, sw=shortwave_radiation) %>%
mutate(year=lubridate::year(time),
month=factor(month.abb[lubridate::month(time)], levels=month.abb)
) %>%
pivot_longer(cols=-c(time, year, month, hour))
dfHourlyCode
## # A tibble: 353,808 × 6
## time hour year month name value
## <dttm> <int> <dbl> <fct> <chr> <dbl>
## 1 2010-01-01 00:00:00 0 2010 Jan wc 2
## 2 2010-01-01 00:00:00 0 2010 Jan et 0.02
## 3 2010-01-01 00:00:00 0 2010 Jan sw 0
## 4 2010-01-01 01:00:00 1 2010 Jan wc 1
## 5 2010-01-01 01:00:00 1 2010 Jan et 0.01
## 6 2010-01-01 01:00:00 1 2010 Jan sw 0
## 7 2010-01-01 02:00:00 2 2010 Jan wc 0
## 8 2010-01-01 02:00:00 2 2010 Jan et 0.01
## 9 2010-01-01 02:00:00 2 2010 Jan sw 0
## 10 2010-01-01 03:00:00 3 2010 Jan wc 0
## # … with 353,798 more rows
# Exploration of weather codes overall
dfHourlyCode %>%
filter(name=="wc", year<=2022) %>%
count(value) %>%
ggplot(aes(x=fct_rev(factor(value)), y=n/1000)) +
geom_col(fill="lightblue") +
geom_text(aes(label=round(n/1000, 1)), hjust=0, size=3) +
labs(title="Weather codes in hourly data (2010-2022)", y="Count (000)", x="Weather Code") +
coord_flip()
# Exploration of weather codes by month
dfHourlyCode %>%
filter(name=="wc", year<=2022) %>%
count(month, value) %>%
group_by(month) %>%
mutate(pct=n/sum(n)) %>%
ungroup() %>%
ggplot(aes(x=fct_rev(factor(value)), y=pct)) +
geom_col(fill="lightblue") +
geom_text(aes(label=paste0(round(100*pct, 1), "%")), hjust=0, size=3) +
labs(title="Weather codes in hourly data (2010-2022)", y="Frequency", x="Weather Code") +
coord_flip() +
facet_wrap(~month)
# Exploration of weather codes by month
dfHourlyCode %>%
filter(name=="wc", year<=2022) %>%
mutate(wType=case_when(value<=3~"Dry",
value>=51 & value<=55~"Drizzle",
value>=61 & value<=65~"Rain",
value>=71 & value<=75~"Snow",
TRUE~"error"
)
) %>%
count(month, wType) %>%
group_by(month) %>%
mutate(pct=n/sum(n)) %>%
ungroup() %>%
ggplot(aes(x=month, y=pct)) +
geom_col(aes(fill=factor(wType, levels=c("Dry", "Drizzle", "Rain", "Snow"))), position="stack") +
labs(title="Precipitation types in hourly data (2010-2022)", y="Frequency", x=NULL) +
coord_flip() +
scale_fill_discrete("")
# Weather codes from WMO Code Table 4677 (select examples)
# 00 Cloud development not observed or not observable
# 01 Clouds generally dissolving or becoming less developed
# 02 State of sky on the whole unchanged
# 03 Clouds generally forming or developing
# 51 Drizzle, not freezing, continuous (slight)
# 53 Drizzle, not freezing, continuous (moderate)
# 55 Drizzle, not freezing, continuous (heavy)
# 61 Rain, not freezing, continuous (slight)
# 63 Rain, not freezing, continuous (moderate)
# 65 Rain, not freezing, continuous (heavy)
# 71 Continuous fall of snowflakes (slight)
# 73 Continuous fall of snowflakes (moderate)
# 75 Continuous fall of snowflakes (heavy)
Shortwave radiation is explored:
dfHourlyCode %>%
filter(name=="sw", year<=2022) %>%
ggplot(aes(x=factor(hour), y=value)) +
geom_boxplot(fill="lightblue") +
facet_wrap(~month) +
labs(title="Average shortwave solar radiation over the past hour", x=NULL, y="Watts per sqaure meter")
dfHourlyCode %>%
filter(name=="sw", year<=2022) %>%
group_by(hour, month) %>%
summarize(value=mean(value), .groups="drop") %>%
ggplot(aes(x=factor(hour), y=value)) +
geom_point() +
facet_wrap(~month) +
labs(title="Average hourly shortwave solar radiation by hour and month (2010-2022)",
x=NULL,
y="Watts per sqaure meter"
)
dfHourlyCode %>%
filter(name %in% c("sw"), year<=2022) %>%
mutate(date=lubridate::date(time)) %>%
group_by(date, name) %>%
mutate(isMax=ifelse(value==max(value), 1, 0), isMin=ifelse(value==min(value), 1, 0)) %>%
group_by(name, month, hour) %>%
summarize(across(c(isMax, isMin), mean), .groups="drop") %>%
pivot_longer(-c(name, month, hour), names_to="metric") %>%
ggplot(aes(x=hour, y=value)) +
geom_line(aes(color=metric, group=metric)) +
facet_wrap(~month) +
labs(x="Hour of day",
y="% of time as max/min",
title=paste0("Shortwave radiation", ": maximum and minimum by hour"),
subtitle=paste0("Ties included as full value")
) +
scale_color_discrete("Metric:")
Evapotranspiration is explored:
dfHourlyCode %>%
filter(name=="et", year<=2022) %>%
ggplot(aes(x=factor(hour), y=value)) +
geom_boxplot(fill="lightblue") +
facet_wrap(~month) +
labs(title="Evapotranspiration of a well-watered grass field", x="Hour of day", y="mm")
dfHourlyCode %>%
filter(name=="et", year<=2022) %>%
group_by(hour, month) %>%
summarize(value=mean(value), .groups="drop") %>%
ggplot(aes(x=factor(hour), y=value)) +
geom_point() +
facet_wrap(~month) +
labs(title="Mean evapotranspiration of a well-watered grass field by hour and month (2010-2022)",
x="Hour of day",
y="mm"
)
dfHourlyCode %>%
filter(name %in% c("et"), year<=2022) %>%
mutate(date=lubridate::date(time)) %>%
group_by(date, name) %>%
mutate(isMax=ifelse(value==max(value), 1, 0), isMin=ifelse(value==min(value), 1, 0)) %>%
group_by(name, month, hour) %>%
summarize(across(c(isMax, isMin), mean), .groups="drop") %>%
pivot_longer(-c(name, month, hour), names_to="metric") %>%
ggplot(aes(x=hour, y=value)) +
geom_line(aes(color=metric, group=metric)) +
facet_wrap(~month) +
labs(x="Hour of day",
y="proportion of time as max/min",
title=paste0("Evapotranspiration", ": maximum and minimum by hour"),
subtitle=paste0("Ties included as full value")
) +
scale_color_discrete("Metric:")
Cloud cover is explored:
# Create cloud cover data
dfHourlyCloud <- tmpOMHourly$tblHourly %>%
select(time, hour, contains("cloud")) %>%
mutate(year=lubridate::year(time),
month=factor(month.abb[lubridate::month(time)], levels=month.abb)
) %>%
pivot_longer(cols=-c(time, year, month, hour))
dfHourlyCloud
## # A tibble: 471,744 × 6
## time hour year month name value
## <dttm> <int> <dbl> <fct> <chr> <int>
## 1 2010-01-01 00:00:00 0 2010 Jan cloudcover 62
## 2 2010-01-01 00:00:00 0 2010 Jan cloudcover_low 69
## 3 2010-01-01 00:00:00 0 2010 Jan cloudcover_mid 0
## 4 2010-01-01 00:00:00 0 2010 Jan cloudcover_high 0
## 5 2010-01-01 01:00:00 1 2010 Jan cloudcover 47
## 6 2010-01-01 01:00:00 1 2010 Jan cloudcover_low 52
## 7 2010-01-01 01:00:00 1 2010 Jan cloudcover_mid 0
## 8 2010-01-01 01:00:00 1 2010 Jan cloudcover_high 0
## 9 2010-01-01 02:00:00 2 2010 Jan cloudcover 20
## 10 2010-01-01 02:00:00 2 2010 Jan cloudcover_low 22
## # … with 471,734 more rows
# Boxplot for cloud cover types
for(keyMetric in unique(dfHourlyCloud$name)) {
p1 <- dfHourlyCloud %>%
filter(name==keyMetric, year<=2022) %>%
ggplot(aes(x=factor(hour), y=value)) +
geom_boxplot(fill="lightblue") +
facet_wrap(~month) +
labs(x="Hour of day",
y="% sky covered with cloud",
title=paste0(keyMetric, ": % sky covered with cloud")
)
print(p1)
}
# Create max/min for metric
tmpDFCloud <- dfHourlyCloud %>%
filter(year<=2022) %>%
mutate(date=lubridate::date(time)) %>%
group_by(date, name) %>%
mutate(isMax=ifelse(value==max(value), 1, 0), isMin=ifelse(value==min(value), 1, 0)) %>%
group_by(name, month, hour) %>%
summarize(across(c(isMax, isMin), mean), .groups="drop") %>%
pivot_longer(-c(name, month, hour), names_to="metric")
tmpDFCloud
## # A tibble: 2,304 × 5
## name month hour metric value
## <chr> <fct> <int> <chr> <dbl>
## 1 cloudcover Jan 0 isMax 0.300
## 2 cloudcover Jan 0 isMin 0.208
## 3 cloudcover Jan 1 isMax 0.273
## 4 cloudcover Jan 1 isMin 0.161
## 5 cloudcover Jan 2 isMax 0.266
## 6 cloudcover Jan 2 isMin 0.132
## 7 cloudcover Jan 3 isMax 0.283
## 8 cloudcover Jan 3 isMin 0.141
## 9 cloudcover Jan 4 isMax 0.298
## 10 cloudcover Jan 4 isMin 0.141
## # … with 2,294 more rows
# Plot max/min for metric
for(keyMetric in unique(tmpDFCloud$name)) {
p1 <- tmpDFCloud %>%
filter(name==keyMetric) %>%
ggplot(aes(x=hour, y=value)) +
geom_line(aes(color=metric, group=metric)) +
facet_wrap(~month) +
labs(x="Hour of day",
y="% of time as max/min",
title=paste0(keyMetric, ": maximum and minimum by hour"),
subtitle=paste0("Ties included as full value")
) +
scale_color_discrete("Metric:")
print(p1)
}
Atmospheric pressure is explored:
# Create pressure data
dfHourlyPressure <- tmpOMHourly$tblHourly %>%
select(time, hour, contains("pressure")) %>%
mutate(year=lubridate::year(time),
month=factor(month.abb[lubridate::month(time)], levels=month.abb)
) %>%
pivot_longer(cols=-c(time, year, month, hour))
dfHourlyPressure
## # A tibble: 353,808 × 6
## time hour year month name value
## <dttm> <int> <dbl> <fct> <chr> <dbl>
## 1 2010-01-01 00:00:00 0 2010 Jan pressure_msl 1024.
## 2 2010-01-01 00:00:00 0 2010 Jan surface_pressure 1001.
## 3 2010-01-01 00:00:00 0 2010 Jan vapor_pressure_deficit 0.1
## 4 2010-01-01 01:00:00 1 2010 Jan pressure_msl 1025.
## 5 2010-01-01 01:00:00 1 2010 Jan surface_pressure 1001.
## 6 2010-01-01 01:00:00 1 2010 Jan vapor_pressure_deficit 0.09
## 7 2010-01-01 02:00:00 2 2010 Jan pressure_msl 1025.
## 8 2010-01-01 02:00:00 2 2010 Jan surface_pressure 1002.
## 9 2010-01-01 02:00:00 2 2010 Jan vapor_pressure_deficit 0.08
## 10 2010-01-01 03:00:00 3 2010 Jan pressure_msl 1026.
## # … with 353,798 more rows
# Boxplot for pressure types
for(keyMetric in unique(dfHourlyPressure$name)) {
tmpUnits <- tmpOMHourly$tblUnits %>% filter(name==keyMetric) %>% pull(value)
p1 <- dfHourlyPressure %>%
filter(name==keyMetric, year<=2022) %>%
ggplot(aes(x=factor(hour), y=value)) +
geom_boxplot(fill="lightblue") +
facet_wrap(~month) +
labs(x="Hour of day",
y=paste0(keyMetric, " (", tmpUnits, ")"),
title=paste0(keyMetric, ": ", tmpUnits)
)
print(p1)
}
dfHourlyPressure %>%
pivot_wider(id_cols=c(time, hour, year, month)) %>%
count(pressure_msl, surface_pressure) %>%
ggplot(aes(x=pressure_msl, y=surface_pressure)) +
geom_point(aes(size=n)) +
geom_smooth(aes(weight=n), method="lm") +
labs(title="Surface pressure vs. MSL", x="MSL", y="Surface Pressure")
## `geom_smooth()` using formula = 'y ~ x'
# Create max/min for metric
tmpDFPressure <- dfHourlyPressure %>%
filter(year<=2022) %>%
mutate(date=lubridate::date(time)) %>%
group_by(date, name) %>%
mutate(isMax=ifelse(value==max(value), 1, 0), isMin=ifelse(value==min(value), 1, 0)) %>%
group_by(name, month, hour) %>%
summarize(across(c(isMax, isMin), mean), .groups="drop") %>%
pivot_longer(-c(name, month, hour), names_to="metric")
tmpDFPressure
## # A tibble: 1,728 × 5
## name month hour metric value
## <chr> <fct> <int> <chr> <dbl>
## 1 pressure_msl Jan 0 isMax 0.278
## 2 pressure_msl Jan 0 isMin 0.283
## 3 pressure_msl Jan 1 isMax 0.0298
## 4 pressure_msl Jan 1 isMin 0.0546
## 5 pressure_msl Jan 2 isMax 0.0149
## 6 pressure_msl Jan 2 isMin 0.0223
## 7 pressure_msl Jan 3 isMax 0.0248
## 8 pressure_msl Jan 3 isMin 0.0248
## 9 pressure_msl Jan 4 isMax 0.0174
## 10 pressure_msl Jan 4 isMin 0.0149
## # … with 1,718 more rows
# Plot max/min for metric
for(keyMetric in unique(tmpDFPressure$name)) {
p1 <- tmpDFPressure %>%
filter(name==keyMetric) %>%
ggplot(aes(x=hour, y=value)) +
geom_line(aes(color=metric, group=metric)) +
facet_wrap(~month) +
labs(x="Hour of day",
y="% of time as max/min",
title=paste0(keyMetric, ": maximum and minimum by hour"),
subtitle=paste0("Ties included as full value")
) +
scale_color_discrete("Metric:")
print(p1)
}
Soil temperature is explored:
# Create soil temperature data
dfHourlySoilTemp <- tmpOMHourly$tblHourly %>%
select(time, date, hour, starts_with("soil_temp")) %>%
mutate(year=lubridate::year(time),
month=factor(month.abb[lubridate::month(time)], levels=month.abb)
) %>%
pivot_longer(cols=-c(time, date, year, month, hour))
dfHourlySoilTemp
## # A tibble: 471,744 × 7
## time date hour year month name value
## <dttm> <date> <int> <dbl> <fct> <chr> <dbl>
## 1 2010-01-01 00:00:00 2010-01-01 0 2010 Jan soil_temperature_0_to… -1.5
## 2 2010-01-01 00:00:00 2010-01-01 0 2010 Jan soil_temperature_7_to… -0.4
## 3 2010-01-01 00:00:00 2010-01-01 0 2010 Jan soil_temperature_28_t… 2.4
## 4 2010-01-01 00:00:00 2010-01-01 0 2010 Jan soil_temperature_100_… 9
## 5 2010-01-01 01:00:00 2010-01-01 1 2010 Jan soil_temperature_0_to… -1.6
## 6 2010-01-01 01:00:00 2010-01-01 1 2010 Jan soil_temperature_7_to… -0.4
## 7 2010-01-01 01:00:00 2010-01-01 1 2010 Jan soil_temperature_28_t… 2.4
## 8 2010-01-01 01:00:00 2010-01-01 1 2010 Jan soil_temperature_100_… 9
## 9 2010-01-01 02:00:00 2010-01-01 2 2010 Jan soil_temperature_0_to… -1.8
## 10 2010-01-01 02:00:00 2010-01-01 2 2010 Jan soil_temperature_7_to… -0.4
## # … with 471,734 more rows
# Boxplot for soil temperature
for(keyMetric in unique(dfHourlySoilTemp$name)) {
tmpUnits <- tmpOMHourly$tblUnits %>% filter(name==keyMetric) %>% pull(value)
p1 <- dfHourlySoilTemp %>%
filter(name==keyMetric, year<=2022) %>%
ggplot(aes(x=factor(hour), y=value)) +
geom_boxplot(fill="lightblue") +
facet_wrap(~month) +
labs(x="Hour of day",
y=paste0(keyMetric, " (", tmpUnits, ")"),
title=paste0(keyMetric, ": ", tmpUnits)
)
print(p1)
}
# Mean and standard deviation by month
dfHourlySoilTemp %>%
group_by(date, name) %>%
summarize(across(value, .fns=list(mu=mean, sigma=sd)), .groups="drop") %>%
mutate(doy=lubridate::yday(date)) %>%
group_by(doy, name) %>%
summarize(across(starts_with("value"), .fns=list(mu=mean)), .groups="drop") %>%
pivot_longer(cols=-c(doy, name), names_to="metric") %>%
ggplot(aes(x=doy, y=value)) +
geom_line(aes(group=name, color=stringr::str_replace(name, "soil_temperature_", ""))) +
facet_wrap(~c("value_mu_mu"="Daily mean", "value_sigma_mu"="Mean daily standard deviation")[metric],
nrow=2,
scales="free_y"
) +
labs(x="Day of Year",
y="Degrees (C)",
title="Soil temperature mean and average daily standard deviation"
) +
scale_color_discrete("Soil depth")
# Create max/min for metric
tmpDFSoilTemp <- dfHourlySoilTemp %>%
filter(year<=2022) %>%
mutate(date=lubridate::date(time)) %>%
group_by(date, name) %>%
mutate(isMax=ifelse(value==max(value), 1, 0), isMin=ifelse(value==min(value), 1, 0)) %>%
group_by(name, month, hour) %>%
summarize(across(c(isMax, isMin), mean), .groups="drop") %>%
pivot_longer(-c(name, month, hour), names_to="metric")
tmpDFSoilTemp
## # A tibble: 2,304 × 5
## name month hour metric value
## <chr> <fct> <int> <chr> <dbl>
## 1 soil_temperature_0_to_7cm Jan 0 isMax 0.238
## 2 soil_temperature_0_to_7cm Jan 0 isMin 0.208
## 3 soil_temperature_0_to_7cm Jan 1 isMax 0.102
## 4 soil_temperature_0_to_7cm Jan 1 isMin 0.161
## 5 soil_temperature_0_to_7cm Jan 2 isMax 0.0645
## 6 soil_temperature_0_to_7cm Jan 2 isMin 0.154
## 7 soil_temperature_0_to_7cm Jan 3 isMax 0.0670
## 8 soil_temperature_0_to_7cm Jan 3 isMin 0.154
## 9 soil_temperature_0_to_7cm Jan 4 isMax 0.0620
## 10 soil_temperature_0_to_7cm Jan 4 isMin 0.166
## # … with 2,294 more rows
# Plot max/min for metric
for(keyMetric in unique(tmpDFSoilTemp$name)) {
p1 <- tmpDFSoilTemp %>%
filter(name==keyMetric) %>%
ggplot(aes(x=hour, y=value)) +
geom_line(aes(color=metric, group=metric)) +
facet_wrap(~month) +
labs(x="Hour of day",
y="% of time as max/min",
title=paste0(keyMetric, ": maximum and minimum by hour"),
subtitle=paste0("Ties included as full value")
) +
scale_color_discrete("Metric:")
print(p1)
}
Soil moisture is explored:
# Create soil moisture data
dfHourlySoilMoist <- tmpOMHourly$tblHourly %>%
select(time, date, hour, starts_with("soil_moist")) %>%
mutate(year=lubridate::year(time),
month=factor(month.abb[lubridate::month(time)], levels=month.abb)
) %>%
pivot_longer(cols=-c(time, date, year, month, hour))
dfHourlySoilMoist
## # A tibble: 471,744 × 7
## time date hour year month name value
## <dttm> <date> <int> <dbl> <fct> <chr> <dbl>
## 1 2010-01-01 00:00:00 2010-01-01 0 2010 Jan soil_moisture_0_to_7cm 0.295
## 2 2010-01-01 00:00:00 2010-01-01 0 2010 Jan soil_moisture_7_to_28… 0.3
## 3 2010-01-01 00:00:00 2010-01-01 0 2010 Jan soil_moisture_28_to_1… 0.334
## 4 2010-01-01 00:00:00 2010-01-01 0 2010 Jan soil_moisture_100_to_… 0.31
## 5 2010-01-01 01:00:00 2010-01-01 1 2010 Jan soil_moisture_0_to_7cm 0.295
## 6 2010-01-01 01:00:00 2010-01-01 1 2010 Jan soil_moisture_7_to_28… 0.3
## 7 2010-01-01 01:00:00 2010-01-01 1 2010 Jan soil_moisture_28_to_1… 0.334
## 8 2010-01-01 01:00:00 2010-01-01 1 2010 Jan soil_moisture_100_to_… 0.31
## 9 2010-01-01 02:00:00 2010-01-01 2 2010 Jan soil_moisture_0_to_7cm 0.294
## 10 2010-01-01 02:00:00 2010-01-01 2 2010 Jan soil_moisture_7_to_28… 0.3
## # … with 471,734 more rows
# Boxplot for soil moisture
for(keyMetric in unique(dfHourlySoilMoist$name)) {
tmpUnits <- tmpOMHourly$tblUnits %>% filter(name==keyMetric) %>% pull(value)
p1 <- dfHourlySoilMoist %>%
filter(name==keyMetric, year<=2022) %>%
ggplot(aes(x=factor(hour), y=value)) +
geom_boxplot(fill="lightblue") +
facet_wrap(~month) +
labs(x="Hour of day",
y=paste0(keyMetric, " (", tmpUnits, ")"),
title=paste0(keyMetric, ": ", tmpUnits)
)
print(p1)
}
# Mean and standard deviation by month
dfHourlySoilMoist %>%
group_by(date, name) %>%
summarize(across(value, .fns=list(mu=mean, sigma=sd)), .groups="drop") %>%
mutate(doy=lubridate::yday(date)) %>%
group_by(doy, name) %>%
summarize(across(starts_with("value"), .fns=list(mu=mean)), .groups="drop") %>%
pivot_longer(cols=-c(doy, name), names_to="metric") %>%
ggplot(aes(x=doy, y=value)) +
geom_line(aes(group=name, color=stringr::str_replace(name, "soil_moisture_", ""))) +
facet_wrap(~c("value_mu_mu"="Daily mean", "value_sigma_mu"="Mean daily standard deviation")[metric],
nrow=2,
scales="free_y"
) +
labs(x="Day of Year",
y="cubic meters per cubic meter\n(volumetric mixing ratio)",
title="Soil moisture mean and average daily standard deviation"
) +
scale_color_discrete("Soil depth")
# Create max/min for metric
tmpDFSoilMoist <- dfHourlySoilMoist %>%
filter(year<=2022) %>%
mutate(date=lubridate::date(time)) %>%
group_by(date, name) %>%
mutate(isMax=ifelse(value==max(value), 1, 0), isMin=ifelse(value==min(value), 1, 0)) %>%
group_by(name, month, hour) %>%
summarize(across(c(isMax, isMin), mean), .groups="drop") %>%
pivot_longer(-c(name, month, hour), names_to="metric")
tmpDFSoilMoist
## # A tibble: 2,304 × 5
## name month hour metric value
## <chr> <fct> <int> <chr> <dbl>
## 1 soil_moisture_0_to_7cm Jan 0 isMax 0.737
## 2 soil_moisture_0_to_7cm Jan 0 isMin 0.0844
## 3 soil_moisture_0_to_7cm Jan 1 isMax 0.529
## 4 soil_moisture_0_to_7cm Jan 1 isMin 0.0744
## 5 soil_moisture_0_to_7cm Jan 2 isMax 0.434
## 6 soil_moisture_0_to_7cm Jan 2 isMin 0.0794
## 7 soil_moisture_0_to_7cm Jan 3 isMax 0.375
## 8 soil_moisture_0_to_7cm Jan 3 isMin 0.0893
## 9 soil_moisture_0_to_7cm Jan 4 isMax 0.310
## 10 soil_moisture_0_to_7cm Jan 4 isMin 0.0918
## # … with 2,294 more rows
# Plot max/min for metric
for(keyMetric in unique(tmpDFSoilMoist$name)) {
p1 <- tmpDFSoilMoist %>%
filter(name==keyMetric) %>%
ggplot(aes(x=hour, y=value)) +
geom_line(aes(color=metric, group=metric)) +
facet_wrap(~month) +
labs(x="Hour of day",
y="% of time as max/min",
title=paste0(keyMetric, ": maximum and minimum by hour"),
subtitle=paste0("Ties included as full value")
) +
scale_color_discrete("Metric:")
print(p1)
}
Metrics are explored for their variation over months and over hours of day:
# Sample database
tmpTemp <- tmpOMHourly$tblHourly %>%
select(time, date, temperature_2m) %>%
mutate(month=factor(month.abb[lubridate::month(date)], levels=month.abb),
hour=lubridate::hour(time),
fct_hour=factor(hour),
rndTemp=round(2*temperature_2m, 0)/2
)
tmpTemp
## # A tibble: 117,936 × 7
## time date temperature_2m month hour fct_hour rndTemp
## <dttm> <date> <dbl> <fct> <int> <fct> <dbl>
## 1 2010-01-01 00:00:00 2010-01-01 -9.5 Jan 0 0 -9.5
## 2 2010-01-01 01:00:00 2010-01-01 -9.8 Jan 1 1 -10
## 3 2010-01-01 02:00:00 2010-01-01 -10.3 Jan 2 2 -10.5
## 4 2010-01-01 03:00:00 2010-01-01 -10.8 Jan 3 3 -11
## 5 2010-01-01 04:00:00 2010-01-01 -11.3 Jan 4 4 -11.5
## 6 2010-01-01 05:00:00 2010-01-01 -11.8 Jan 5 5 -12
## 7 2010-01-01 06:00:00 2010-01-01 -12.3 Jan 6 6 -12.5
## 8 2010-01-01 07:00:00 2010-01-01 -12.8 Jan 7 7 -13
## 9 2010-01-01 08:00:00 2010-01-01 -13.2 Jan 8 8 -13
## 10 2010-01-01 09:00:00 2010-01-01 -13.4 Jan 9 9 -13.5
## # … with 117,926 more rows
# Simple predictive model for temperature/month
prdTemp <- tmpTemp %>%
count(rndTemp, month) %>%
arrange(rndTemp, desc(n)) %>%
group_by(rndTemp) %>%
mutate(corr=row_number()==1, pred=first(month)) %>%
ungroup()
prdTemp
## # A tibble: 828 × 5
## rndTemp month n corr pred
## <dbl> <fct> <int> <lgl> <fct>
## 1 -30 Jan 4 TRUE Jan
## 2 -29.5 Jan 6 TRUE Jan
## 3 -29 Jan 4 TRUE Jan
## 4 -28.5 Jan 6 TRUE Jan
## 5 -28 Jan 2 TRUE Jan
## 6 -27.5 Jan 1 TRUE Jan
## 7 -27 Jan 6 TRUE Jan
## 8 -26.5 Jan 4 TRUE Jan
## 9 -26 Jan 10 TRUE Jan
## 10 -25.5 Jan 15 TRUE Jan
## # … with 818 more rows
# Confusion matrix and accuracy
prdTemp %>%
count(month, corr, wt=n) %>%
pivot_wider(id_cols=month, names_from=corr, values_from=n, values_fill=0) %>%
bind_rows(summarize(., across(where(is.numeric), sum)) %>%
mutate(month="All") %>%
select(month, everything())
) %>%
mutate(n=`TRUE`+`FALSE`,
pctCorrect=`TRUE`/n,
pctNaive=ifelse(month=="All", 1/(nrow(.)-1), 2*n/sum(n)),
lift=pctCorrect/pctNaive
)
## # A tibble: 13 × 7
## month `FALSE` `TRUE` n pctCorrect pctNaive lift
## <chr> <int> <int> <int> <dbl> <dbl> <dbl>
## 1 Jan 4215 6201 10416 0.595 0.0883 6.74
## 2 Feb 8404 1076 9480 0.114 0.0804 1.41
## 3 Mar 8817 1599 10416 0.154 0.0883 1.74
## 4 Apr 6972 3108 10080 0.308 0.0855 3.61
## 5 May 8584 1832 10416 0.176 0.0883 1.99
## 6 Jun 8535 1185 9720 0.122 0.0824 1.48
## 7 Jul 6524 3148 9672 0.325 0.0820 3.97
## 8 Aug 4335 5337 9672 0.552 0.0820 6.73
## 9 Sep 6888 2472 9360 0.264 0.0794 3.33
## 10 Oct 7248 2424 9672 0.251 0.0820 3.06
## 11 Nov 9360 0 9360 0 0.0794 0
## 12 Dec 7130 2542 9672 0.263 0.0820 3.20
## 13 All 87012 30924 117936 0.262 0.0833 3.15
prdTemp %>%
count(month, pred, corr, wt=n) %>%
ggplot(aes(x=month, y=pred)) +
labs(x="Actual month", y="Predicted month", title="Actual vs. predicted month using temperature") +
geom_text(aes(label=n)) +
geom_tile(aes(fill=corr), alpha=0.25)
The simple predictive model is converted to functional form:
simpleOneVarPredict <- function(df,
tgt,
prd,
nPrint=30,
showPlot=TRUE,
returnData=TRUE
) {
# FUNCTION ARGUMENTS:
# df: data frame or tibble with key elements
# tgt: target variable
# prd: predictor variable
# nPrint: maximum number of lines of confusion matrix to print
# 0 means do not print any summary statistics
# showPlot: boolean, should overlap plot be created and shown?
# Counts of predictor to target variable
dfPred <- df %>%
group_by(across(all_of(c(prd, tgt)))) %>%
summarize(n=n(), .groups="drop") %>%
arrange(across(all_of(prd)), desc(n)) %>%
group_by(across(all_of(prd))) %>%
mutate(correct=row_number()==1, predicted=first(get(tgt))) %>%
ungroup()
# Confusion matrix and accuracy
dfConf <- dfPred %>%
group_by(across(all_of(c(tgt, "correct")))) %>%
summarize(n=sum(n), .groups="drop") %>%
pivot_wider(id_cols=tgt, names_from=correct, values_from=n, values_fill=0) %>%
mutate(n=`TRUE`+`FALSE`,
pctCorrect=`TRUE`/n,
pctNaive=1/(nrow(.)),
lift=pctCorrect/pctNaive-1
)
# Overall confusion matrix
dfConfAll <- dfConf %>%
summarize(nMax=max(n), across(c(`FALSE`, `TRUE`, "n"), sum)) %>%
mutate(pctCorrect=`TRUE`/n,
pctNaive=nMax/n,
lift=pctCorrect/pctNaive-1,
nBucket=length(unique(dfPred[[prd]]))
)
# Print confusion matrices
if(nPrint > 0) {
cat("\nAccuracy by target subgroup:\n")
dfConf %>% print(n=nPrint)
cat("\nOverall Accuracy:\n")
dfConfAll %>% print(n=nPrint)
}
# Plot of overlaps
if(isTRUE(showPlot)) {
p1 <- dfPred %>%
group_by(across(c(all_of(tgt), "predicted", "correct"))) %>%
summarize(n=sum(n), .groups="drop") %>%
ggplot(aes(x=get(tgt), y=predicted)) +
labs(x="Actual",
y="Predicted",
title=paste0("Actual vs. predicted ", tgt),
subtitle=paste0("(using ", prd, ")")
) +
geom_text(aes(label=n)) +
geom_tile(aes(fill=correct), alpha=0.25)
print(p1)
}
# Return data if requested
if(isTRUE(returnData)) list(dfPred=dfPred, dfConf=dfConf, dfConfAll=dfConfAll)
}
tstFunc <- simpleOneVarPredict(tmpTemp, tgt="month", prd="rndTemp")
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
## # Was:
## data %>% select(tgt)
##
## # Now:
## data %>% select(all_of(tgt))
##
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
##
## Accuracy by target subgroup:
## # A tibble: 12 × 7
## month `FALSE` `TRUE` n pctCorrect pctNaive lift
## <fct> <int> <int> <int> <dbl> <dbl> <dbl>
## 1 Jan 4215 6201 10416 0.595 0.0833 6.14
## 2 Feb 8404 1076 9480 0.114 0.0833 0.362
## 3 Mar 8817 1599 10416 0.154 0.0833 0.842
## 4 Apr 6972 3108 10080 0.308 0.0833 2.7
## 5 May 8584 1832 10416 0.176 0.0833 1.11
## 6 Jun 8535 1185 9720 0.122 0.0833 0.463
## 7 Jul 6524 3148 9672 0.325 0.0833 2.91
## 8 Aug 4335 5337 9672 0.552 0.0833 5.62
## 9 Sep 6888 2472 9360 0.264 0.0833 2.17
## 10 Oct 7248 2424 9672 0.251 0.0833 2.01
## 11 Nov 9360 0 9360 0 0.0833 -1
## 12 Dec 7130 2542 9672 0.263 0.0833 2.15
##
## Overall Accuracy:
## # A tibble: 1 × 8
## nMax `FALSE` `TRUE` n pctCorrect pctNaive lift nBucket
## <int> <int> <int> <int> <dbl> <dbl> <dbl> <int>
## 1 10416 87012 30924 117936 0.262 0.0883 1.97 134
tstFunc
## $dfPred
## # A tibble: 828 × 5
## rndTemp month n correct predicted
## <dbl> <fct> <int> <lgl> <fct>
## 1 -30 Jan 4 TRUE Jan
## 2 -29.5 Jan 6 TRUE Jan
## 3 -29 Jan 4 TRUE Jan
## 4 -28.5 Jan 6 TRUE Jan
## 5 -28 Jan 2 TRUE Jan
## 6 -27.5 Jan 1 TRUE Jan
## 7 -27 Jan 6 TRUE Jan
## 8 -26.5 Jan 4 TRUE Jan
## 9 -26 Jan 10 TRUE Jan
## 10 -25.5 Jan 15 TRUE Jan
## # … with 818 more rows
##
## $dfConf
## # A tibble: 12 × 7
## month `FALSE` `TRUE` n pctCorrect pctNaive lift
## <fct> <int> <int> <int> <dbl> <dbl> <dbl>
## 1 Jan 4215 6201 10416 0.595 0.0833 6.14
## 2 Feb 8404 1076 9480 0.114 0.0833 0.362
## 3 Mar 8817 1599 10416 0.154 0.0833 0.842
## 4 Apr 6972 3108 10080 0.308 0.0833 2.7
## 5 May 8584 1832 10416 0.176 0.0833 1.11
## 6 Jun 8535 1185 9720 0.122 0.0833 0.463
## 7 Jul 6524 3148 9672 0.325 0.0833 2.91
## 8 Aug 4335 5337 9672 0.552 0.0833 5.62
## 9 Sep 6888 2472 9360 0.264 0.0833 2.17
## 10 Oct 7248 2424 9672 0.251 0.0833 2.01
## 11 Nov 9360 0 9360 0 0.0833 -1
## 12 Dec 7130 2542 9672 0.263 0.0833 2.15
##
## $dfConfAll
## # A tibble: 1 × 8
## nMax `FALSE` `TRUE` n pctCorrect pctNaive lift nBucket
## <int> <int> <int> <int> <dbl> <dbl> <dbl> <int>
## 1 10416 87012 30924 117936 0.262 0.0883 1.97 134
all.equal(tstFunc$dfPred, rename(prdTemp, correct=corr, predicted=pred))
## [1] TRUE
The function is tested for predictive power of numeric variables, bucketed in to percentiles, on month:
# Create percentiles for numeric variables
tmpTemp <- tmpOMHourly$tblHourly %>%
mutate(month=factor(month.abb[lubridate::month(date)], levels=month.abb),
hour=lubridate::hour(time),
fct_hour=factor(hour),
across(where(is.numeric), .fns=function(x) round(100*percent_rank(x)), .names="pct_{.col}")
)
tmpTemp
## # A tibble: 117,936 × 73
## time date hour temper…¹ relat…² dewpo…³ appar…⁴ press…⁵
## <dttm> <date> <int> <dbl> <int> <dbl> <dbl> <dbl>
## 1 2010-01-01 00:00:00 2010-01-01 0 -9.5 67 -14.4 -15.8 1024.
## 2 2010-01-01 01:00:00 2010-01-01 1 -9.8 69 -14.4 -16.3 1025.
## 3 2010-01-01 02:00:00 2010-01-01 2 -10.3 73 -14.2 -16.8 1025.
## 4 2010-01-01 03:00:00 2010-01-01 3 -10.8 74 -14.5 -17.2 1026.
## 5 2010-01-01 04:00:00 2010-01-01 4 -11.3 75 -14.8 -17.7 1026.
## 6 2010-01-01 05:00:00 2010-01-01 5 -11.8 76 -15.1 -18.2 1026.
## 7 2010-01-01 06:00:00 2010-01-01 6 -12.3 77 -15.5 -18.6 1027.
## 8 2010-01-01 07:00:00 2010-01-01 7 -12.8 78 -15.8 -19 1028.
## 9 2010-01-01 08:00:00 2010-01-01 8 -13.2 79 -16.1 -19.4 1028.
## 10 2010-01-01 09:00:00 2010-01-01 9 -13.4 78 -16.3 -19.6 1028.
## # … with 117,926 more rows, 65 more variables: surface_pressure <dbl>,
## # precipitation <dbl>, rain <dbl>, snowfall <dbl>, cloudcover <int>,
## # cloudcover_low <int>, cloudcover_mid <int>, cloudcover_high <int>,
## # shortwave_radiation <dbl>, direct_radiation <dbl>,
## # direct_normal_irradiance <dbl>, diffuse_radiation <dbl>,
## # windspeed_10m <dbl>, windspeed_100m <dbl>, winddirection_10m <int>,
## # winddirection_100m <int>, windgusts_10m <dbl>, …
# Get key variable names
tmpNames <- tmpTemp %>%
select(starts_with("pct")) %>%
names()
tmpNames
## [1] "pct_hour" "pct_temperature_2m"
## [3] "pct_relativehumidity_2m" "pct_dewpoint_2m"
## [5] "pct_apparent_temperature" "pct_pressure_msl"
## [7] "pct_surface_pressure" "pct_precipitation"
## [9] "pct_rain" "pct_snowfall"
## [11] "pct_cloudcover" "pct_cloudcover_low"
## [13] "pct_cloudcover_mid" "pct_cloudcover_high"
## [15] "pct_shortwave_radiation" "pct_direct_radiation"
## [17] "pct_direct_normal_irradiance" "pct_diffuse_radiation"
## [19] "pct_windspeed_10m" "pct_windspeed_100m"
## [21] "pct_winddirection_10m" "pct_winddirection_100m"
## [23] "pct_windgusts_10m" "pct_et0_fao_evapotranspiration"
## [25] "pct_weathercode" "pct_vapor_pressure_deficit"
## [27] "pct_soil_temperature_0_to_7cm" "pct_soil_temperature_7_to_28cm"
## [29] "pct_soil_temperature_28_to_100cm" "pct_soil_temperature_100_to_255cm"
## [31] "pct_soil_moisture_0_to_7cm" "pct_soil_moisture_7_to_28cm"
## [33] "pct_soil_moisture_28_to_100cm" "pct_soil_moisture_100_to_255cm"
# Get the key predictive metrics
tmpDFR <- map_dfr(.x=tmpNames,
.f=function(x) simpleOneVarPredict(tmpTemp, tgt="month", prd=x, nPrint=0, showPlot=FALSE)$dfConfAll
) %>%
mutate(vrbl=tmpNames) %>%
arrange(desc(lift))
# Print and plot lift by variable
tmpDFR %>%
print(n=50)
## # A tibble: 34 × 9
## nMax `FALSE` `TRUE` n pctCorrect pctNaive lift nBucket vrbl
## <int> <int> <int> <int> <dbl> <dbl> <dbl> <int> <chr>
## 1 10416 64759 53177 117936 0.451 0.0883 4.11 101 pct_soil_temp…
## 2 10416 64819 53117 117936 0.450 0.0883 4.10 101 pct_soil_temp…
## 3 10416 74369 43567 117936 0.369 0.0883 3.18 101 pct_soil_temp…
## 4 10416 81800 36136 117936 0.306 0.0883 2.47 66 pct_soil_mois…
## 5 10416 83423 34513 117936 0.293 0.0883 2.31 101 pct_soil_temp…
## 6 10416 86085 31851 117936 0.270 0.0883 2.06 96 pct_soil_mois…
## 7 10416 86997 30939 117936 0.262 0.0883 1.97 101 pct_temperatu…
## 8 10416 87078 30858 117936 0.262 0.0883 1.96 101 pct_apparent_…
## 9 10416 89320 28616 117936 0.243 0.0883 1.75 101 pct_dewpoint_…
## 10 10416 92788 25148 117936 0.213 0.0883 1.41 100 pct_soil_mois…
## 11 10416 94975 22961 117936 0.195 0.0883 1.20 101 pct_soil_mois…
## 12 10416 95744 22192 117936 0.188 0.0883 1.13 79 pct_vapor_pre…
## 13 10416 100072 17864 117936 0.151 0.0883 0.715 101 pct_pressure_…
## 14 10416 100766 17170 117936 0.146 0.0883 0.648 101 pct_surface_p…
## 15 10416 101246 16690 117936 0.142 0.0883 0.602 40 pct_et0_fao_e…
## 16 10416 102676 15260 117936 0.129 0.0883 0.465 101 pct_winddirec…
## 17 10416 102820 15116 117936 0.128 0.0883 0.451 101 pct_winddirec…
## 18 10416 102853 15083 117936 0.128 0.0883 0.448 64 pct_cloudcover
## 19 10416 102996 14940 117936 0.127 0.0883 0.434 46 pct_cloudcove…
## 20 10416 103110 14826 117936 0.126 0.0883 0.423 55 pct_diffuse_r…
## 21 10416 103169 14767 117936 0.125 0.0883 0.418 55 pct_shortwave…
## 22 10416 103186 14750 117936 0.125 0.0883 0.416 101 pct_windspeed…
## 23 10416 103197 14739 117936 0.125 0.0883 0.415 12 pct_weatherco…
## 24 10416 103299 14637 117936 0.124 0.0883 0.405 101 pct_windspeed…
## 25 10416 103594 14342 117936 0.122 0.0883 0.377 50 pct_direct_ra…
## 26 10416 103778 14158 117936 0.120 0.0883 0.359 50 pct_direct_no…
## 27 10416 103800 14136 117936 0.120 0.0883 0.357 50 pct_cloudcove…
## 28 10416 103861 14075 117936 0.119 0.0883 0.351 97 pct_windgusts…
## 29 10416 105069 12867 117936 0.109 0.0883 0.235 60 pct_relativeh…
## 30 10416 105133 12803 117936 0.109 0.0883 0.229 41 pct_cloudcove…
## 31 10416 106183 11753 117936 0.0997 0.0883 0.128 5 pct_snowfall
## 32 10416 106324 11612 117936 0.0985 0.0883 0.115 12 pct_rain
## 33 10416 106924 11012 117936 0.0934 0.0883 0.0572 13 pct_precipita…
## 34 10416 107520 10416 117936 0.0883 0.0883 0 24 pct_hour
tmpDFR %>%
ggplot(aes(x=fct_reorder(stringr::str_replace_all(vrbl, "pct_", ""), lift), y=lift)) +
geom_col(fill="lightblue") +
coord_flip() +
labs(x=NULL, y="lift", title="Lift by hourly variable percentile in predicting month")
# Example for soil temperature and high clouds
simpleOneVarPredict(tmpTemp, tgt="month", prd="pct_soil_temperature_100_to_255cm", returnData=FALSE)
##
## Accuracy by target subgroup:
## # A tibble: 12 × 7
## month `FALSE` `TRUE` n pctCorrect pctNaive lift
## <fct> <int> <int> <int> <dbl> <dbl> <dbl>
## 1 Jan 5395 5021 10416 0.482 0.0833 4.78
## 2 Feb 5094 4386 9480 0.463 0.0833 4.55
## 3 Mar 3039 7377 10416 0.708 0.0833 7.50
## 4 Apr 8862 1218 10080 0.121 0.0833 0.45
## 5 May 7343 3073 10416 0.295 0.0833 2.54
## 6 Jun 5365 4355 9720 0.448 0.0833 4.38
## 7 Jul 4445 5227 9672 0.540 0.0833 5.49
## 8 Aug 4762 4910 9672 0.508 0.0833 5.09
## 9 Sep 2744 6616 9360 0.707 0.0833 7.48
## 10 Oct 6858 2814 9672 0.291 0.0833 2.49
## 11 Nov 5822 3538 9360 0.378 0.0833 3.54
## 12 Dec 5030 4642 9672 0.480 0.0833 4.76
##
## Overall Accuracy:
## # A tibble: 1 × 8
## nMax `FALSE` `TRUE` n pctCorrect pctNaive lift nBucket
## <int> <int> <int> <int> <dbl> <dbl> <dbl> <int>
## 1 10416 64759 53177 117936 0.451 0.0883 4.11 101
simpleOneVarPredict(tmpTemp, tgt="month", prd="pct_cloudcover_high", returnData=FALSE)
##
## Accuracy by target subgroup:
## # A tibble: 12 × 7
## month `FALSE` `TRUE` n pctCorrect pctNaive lift
## <fct> <int> <int> <int> <dbl> <dbl> <dbl>
## 1 Jan 5425 4991 10416 0.479 0.0833 4.75
## 2 Feb 9480 0 9480 0 0.0833 -1
## 3 Mar 9807 609 10416 0.0585 0.0833 -0.298
## 4 Apr 9963 117 10080 0.0116 0.0833 -0.861
## 5 May 6551 3865 10416 0.371 0.0833 3.45
## 6 Jun 9345 375 9720 0.0386 0.0833 -0.537
## 7 Jul 8338 1334 9672 0.138 0.0833 0.655
## 8 Aug 8467 1205 9672 0.125 0.0833 0.495
## 9 Sep 9360 0 9360 0 0.0833 -1
## 10 Oct 9529 143 9672 0.0148 0.0833 -0.823
## 11 Nov 9360 0 9360 0 0.0833 -1
## 12 Dec 9508 164 9672 0.0170 0.0833 -0.797
##
## Overall Accuracy:
## # A tibble: 1 × 8
## nMax `FALSE` `TRUE` n pctCorrect pctNaive lift nBucket
## <int> <int> <int> <int> <dbl> <dbl> <dbl> <int>
## 1 10416 105133 12803 117936 0.109 0.0883 0.229 41
The function is tested for predictive power of numeric variables, bucketed in to percentiles, on hour:
# Remove hour from tmpNames
tmpNamesNoHour <- setdiff(tmpNames, "pct_hour")
tmpNamesNoHour
## [1] "pct_temperature_2m" "pct_relativehumidity_2m"
## [3] "pct_dewpoint_2m" "pct_apparent_temperature"
## [5] "pct_pressure_msl" "pct_surface_pressure"
## [7] "pct_precipitation" "pct_rain"
## [9] "pct_snowfall" "pct_cloudcover"
## [11] "pct_cloudcover_low" "pct_cloudcover_mid"
## [13] "pct_cloudcover_high" "pct_shortwave_radiation"
## [15] "pct_direct_radiation" "pct_direct_normal_irradiance"
## [17] "pct_diffuse_radiation" "pct_windspeed_10m"
## [19] "pct_windspeed_100m" "pct_winddirection_10m"
## [21] "pct_winddirection_100m" "pct_windgusts_10m"
## [23] "pct_et0_fao_evapotranspiration" "pct_weathercode"
## [25] "pct_vapor_pressure_deficit" "pct_soil_temperature_0_to_7cm"
## [27] "pct_soil_temperature_7_to_28cm" "pct_soil_temperature_28_to_100cm"
## [29] "pct_soil_temperature_100_to_255cm" "pct_soil_moisture_0_to_7cm"
## [31] "pct_soil_moisture_7_to_28cm" "pct_soil_moisture_28_to_100cm"
## [33] "pct_soil_moisture_100_to_255cm"
# Get the key predictive metrics
tmpDFRHour <- map_dfr(.x=tmpNamesNoHour,
.f=function(x) simpleOneVarPredict(tmpTemp, tgt="fct_hour", prd=x, nPrint=0, showPlot=FALSE)$dfConfAll
) %>%
mutate(vrbl=tmpNamesNoHour) %>%
arrange(desc(lift))
# Print and plot lift by variable
tmpDFRHour %>%
print(n=50)
## # A tibble: 33 × 9
## nMax `FALSE` `TRUE` n pctCorrect pctNaive lift nBucket vrbl
## <int> <int> <int> <int> <dbl> <dbl> <dbl> <int> <chr>
## 1 4914 102257 15679 117936 0.133 0.0417 2.19 55 pct_diffuse_r…
## 2 4914 102824 15112 117936 0.128 0.0417 2.08 55 pct_shortwave…
## 3 4914 105224 12712 117936 0.108 0.0417 1.59 50 pct_direct_ra…
## 4 4914 105893 12043 117936 0.102 0.0417 1.45 50 pct_direct_no…
## 5 4914 106137 11799 117936 0.100 0.0417 1.40 40 pct_et0_fao_e…
## 6 4914 109462 8474 117936 0.0719 0.0417 0.724 60 pct_relativeh…
## 7 4914 109962 7974 117936 0.0676 0.0417 0.623 79 pct_vapor_pre…
## 8 4914 110201 7735 117936 0.0656 0.0417 0.574 101 pct_soil_temp…
## 9 4914 110597 7339 117936 0.0622 0.0417 0.493 97 pct_windgusts…
## 10 4914 110778 7158 117936 0.0607 0.0417 0.457 101 pct_windspeed…
## 11 4914 110854 7082 117936 0.0600 0.0417 0.441 101 pct_temperatu…
## 12 4914 110941 6995 117936 0.0593 0.0417 0.423 101 pct_winddirec…
## 13 4914 110958 6978 117936 0.0592 0.0417 0.420 101 pct_windspeed…
## 14 4914 110983 6953 117936 0.0590 0.0417 0.415 101 pct_apparent_…
## 15 4914 111081 6855 117936 0.0581 0.0417 0.395 101 pct_winddirec…
## 16 4914 111522 6414 117936 0.0544 0.0417 0.305 101 pct_pressure_…
## 17 4914 111548 6388 117936 0.0542 0.0417 0.300 101 pct_surface_p…
## 18 4914 111600 6336 117936 0.0537 0.0417 0.289 64 pct_cloudcover
## 19 4914 111643 6293 117936 0.0534 0.0417 0.281 101 pct_dewpoint_…
## 20 4914 111667 6269 117936 0.0532 0.0417 0.276 101 pct_soil_temp…
## 21 4914 111831 6105 117936 0.0518 0.0417 0.242 46 pct_cloudcove…
## 22 4914 111846 6090 117936 0.0516 0.0417 0.239 50 pct_cloudcove…
## 23 4914 111908 6028 117936 0.0511 0.0417 0.227 101 pct_soil_mois…
## 24 4914 112038 5898 117936 0.0500 0.0417 0.200 100 pct_soil_mois…
## 25 4914 112197 5739 117936 0.0487 0.0417 0.168 12 pct_weatherco…
## 26 4914 112274 5662 117936 0.0480 0.0417 0.152 41 pct_cloudcove…
## 27 4914 112537 5399 117936 0.0458 0.0417 0.0987 96 pct_soil_mois…
## 28 4914 112566 5370 117936 0.0455 0.0417 0.0928 101 pct_soil_temp…
## 29 4914 112687 5249 117936 0.0445 0.0417 0.0682 12 pct_rain
## 30 4914 112691 5245 117936 0.0445 0.0417 0.0674 13 pct_precipita…
## 31 4914 112702 5234 117936 0.0444 0.0417 0.0651 101 pct_soil_temp…
## 32 4914 112803 5133 117936 0.0435 0.0417 0.0446 66 pct_soil_mois…
## 33 4914 112966 4970 117936 0.0421 0.0417 0.0114 5 pct_snowfall
tmpDFRHour %>%
ggplot(aes(x=fct_reorder(stringr::str_replace_all(vrbl, "pct_", ""), lift), y=lift)) +
geom_col(fill="lightblue") +
coord_flip() +
labs(x=NULL, y="lift", title="Lift by hourly variable percentile in predicting hour")
# Example for diffuse radiation and soil moisture
simpleOneVarPredict(tmpTemp, tgt="fct_hour", prd="pct_diffuse_radiation", returnData=FALSE)
##
## Accuracy by target subgroup:
## # A tibble: 24 × 7
## fct_hour `TRUE` `FALSE` n pctCorrect pctNaive lift
## <fct> <int> <int> <int> <dbl> <dbl> <dbl>
## 1 0 4914 0 4914 1 0.0417 23
## 2 1 0 4914 4914 0 0.0417 -1
## 3 2 0 4914 4914 0 0.0417 -1
## 4 3 0 4914 4914 0 0.0417 -1
## 5 4 0 4914 4914 0 0.0417 -1
## 6 5 0 4914 4914 0 0.0417 -1
## 7 6 554 4360 4914 0.113 0.0417 1.71
## 8 7 671 4243 4914 0.137 0.0417 2.28
## 9 8 735 4179 4914 0.150 0.0417 2.59
## 10 9 861 4053 4914 0.175 0.0417 3.21
## 11 10 763 4151 4914 0.155 0.0417 2.73
## 12 11 884 4030 4914 0.180 0.0417 3.32
## 13 12 614 4300 4914 0.125 0.0417 2.00
## 14 13 1130 3784 4914 0.230 0.0417 4.52
## 15 14 1395 3519 4914 0.284 0.0417 5.81
## 16 15 676 4238 4914 0.138 0.0417 2.30
## 17 16 150 4764 4914 0.0305 0.0417 -0.267
## 18 17 0 4914 4914 0 0.0417 -1
## 19 18 324 4590 4914 0.0659 0.0417 0.582
## 20 19 427 4487 4914 0.0869 0.0417 1.09
## 21 20 742 4172 4914 0.151 0.0417 2.62
## 22 21 839 4075 4914 0.171 0.0417 3.10
## 23 22 0 4914 4914 0 0.0417 -1
## 24 23 0 4914 4914 0 0.0417 -1
##
## Overall Accuracy:
## # A tibble: 1 × 8
## nMax `FALSE` `TRUE` n pctCorrect pctNaive lift nBucket
## <int> <int> <int> <int> <dbl> <dbl> <dbl> <int>
## 1 4914 102257 15679 117936 0.133 0.0417 2.19 55
simpleOneVarPredict(tmpTemp, tgt="fct_hour", prd="pct_soil_moisture_100_to_255cm", returnData=FALSE)
##
## Accuracy by target subgroup:
## # A tibble: 24 × 7
## fct_hour `FALSE` `TRUE` n pctCorrect pctNaive lift
## <fct> <int> <int> <int> <dbl> <dbl> <dbl>
## 1 0 4518 396 4914 0.0806 0.0417 0.934
## 2 1 4687 227 4914 0.0462 0.0417 0.109
## 3 2 4884 30 4914 0.00611 0.0417 -0.853
## 4 3 4673 241 4914 0.0490 0.0417 0.177
## 5 4 4739 175 4914 0.0356 0.0417 -0.145
## 6 5 4588 326 4914 0.0663 0.0417 0.592
## 7 6 4756 158 4914 0.0322 0.0417 -0.228
## 8 7 4683 231 4914 0.0470 0.0417 0.128
## 9 8 4620 294 4914 0.0598 0.0417 0.436
## 10 9 4619 295 4914 0.0600 0.0417 0.441
## 11 10 4616 298 4914 0.0606 0.0417 0.455
## 12 11 4766 148 4914 0.0301 0.0417 -0.277
## 13 12 4825 89 4914 0.0181 0.0417 -0.565
## 14 13 4539 375 4914 0.0763 0.0417 0.832
## 15 14 4882 32 4914 0.00651 0.0417 -0.844
## 16 15 4611 303 4914 0.0617 0.0417 0.480
## 17 16 4730 184 4914 0.0374 0.0417 -0.101
## 18 17 4825 89 4914 0.0181 0.0417 -0.565
## 19 18 4422 492 4914 0.100 0.0417 1.40
## 20 19 4774 140 4914 0.0285 0.0417 -0.316
## 21 20 4788 126 4914 0.0256 0.0417 -0.385
## 22 21 4533 381 4914 0.0775 0.0417 0.861
## 23 22 4865 49 4914 0.00997 0.0417 -0.761
## 24 23 4860 54 4914 0.0110 0.0417 -0.736
##
## Overall Accuracy:
## # A tibble: 1 × 8
## nMax `FALSE` `TRUE` n pctCorrect pctNaive lift nBucket
## <int> <int> <int> <int> <dbl> <dbl> <dbl> <int>
## 1 4914 112803 5133 117936 0.0435 0.0417 0.0446 66
Random variables, split equally 0-5, 0-25, and 0-100, are included as an example null state:
# Add random variables
set.seed(23072413)
tmpTemp <- tmpTemp %>%
mutate(rnd005=sample(0:5, size=n(), replace=TRUE),
rnd025=sample(0:25, size=n(), replace=TRUE),
rnd100=sample(0:100, size=n(), replace=TRUE)
)
# Get key variable names
tmpNames_v2 <- tmpTemp %>%
select(starts_with("pct"), starts_with("rnd")) %>%
names()
tmpNames_v2
## [1] "pct_hour" "pct_temperature_2m"
## [3] "pct_relativehumidity_2m" "pct_dewpoint_2m"
## [5] "pct_apparent_temperature" "pct_pressure_msl"
## [7] "pct_surface_pressure" "pct_precipitation"
## [9] "pct_rain" "pct_snowfall"
## [11] "pct_cloudcover" "pct_cloudcover_low"
## [13] "pct_cloudcover_mid" "pct_cloudcover_high"
## [15] "pct_shortwave_radiation" "pct_direct_radiation"
## [17] "pct_direct_normal_irradiance" "pct_diffuse_radiation"
## [19] "pct_windspeed_10m" "pct_windspeed_100m"
## [21] "pct_winddirection_10m" "pct_winddirection_100m"
## [23] "pct_windgusts_10m" "pct_et0_fao_evapotranspiration"
## [25] "pct_weathercode" "pct_vapor_pressure_deficit"
## [27] "pct_soil_temperature_0_to_7cm" "pct_soil_temperature_7_to_28cm"
## [29] "pct_soil_temperature_28_to_100cm" "pct_soil_temperature_100_to_255cm"
## [31] "pct_soil_moisture_0_to_7cm" "pct_soil_moisture_7_to_28cm"
## [33] "pct_soil_moisture_28_to_100cm" "pct_soil_moisture_100_to_255cm"
## [35] "rnd005" "rnd025"
## [37] "rnd100"
# Get the key predictive metrics
tmpDFR_v2 <- map_dfr(.x=tmpNames_v2,
.f=function(x) simpleOneVarPredict(tmpTemp, tgt="month", prd=x, nPrint=0, showPlot=FALSE)$dfConfAll
) %>%
mutate(vrbl=tmpNames_v2) %>%
arrange(desc(lift))
# Print and plot lift by variable
tmpDFR_v2 %>%
print(n=50)
## # A tibble: 37 × 9
## nMax `FALSE` `TRUE` n pctCorrect pctNaive lift nBucket vrbl
## <int> <int> <int> <int> <dbl> <dbl> <dbl> <int> <chr>
## 1 10416 64759 53177 117936 0.451 0.0883 4.11 101 pct_soil_temp…
## 2 10416 64819 53117 117936 0.450 0.0883 4.10 101 pct_soil_temp…
## 3 10416 74369 43567 117936 0.369 0.0883 3.18 101 pct_soil_temp…
## 4 10416 81800 36136 117936 0.306 0.0883 2.47 66 pct_soil_mois…
## 5 10416 83423 34513 117936 0.293 0.0883 2.31 101 pct_soil_temp…
## 6 10416 86085 31851 117936 0.270 0.0883 2.06 96 pct_soil_mois…
## 7 10416 86997 30939 117936 0.262 0.0883 1.97 101 pct_temperatu…
## 8 10416 87078 30858 117936 0.262 0.0883 1.96 101 pct_apparent_…
## 9 10416 89320 28616 117936 0.243 0.0883 1.75 101 pct_dewpoint_…
## 10 10416 92788 25148 117936 0.213 0.0883 1.41 100 pct_soil_mois…
## 11 10416 94975 22961 117936 0.195 0.0883 1.20 101 pct_soil_mois…
## 12 10416 95744 22192 117936 0.188 0.0883 1.13 79 pct_vapor_pre…
## 13 10416 100072 17864 117936 0.151 0.0883 0.715 101 pct_pressure_…
## 14 10416 100766 17170 117936 0.146 0.0883 0.648 101 pct_surface_p…
## 15 10416 101246 16690 117936 0.142 0.0883 0.602 40 pct_et0_fao_e…
## 16 10416 102676 15260 117936 0.129 0.0883 0.465 101 pct_winddirec…
## 17 10416 102820 15116 117936 0.128 0.0883 0.451 101 pct_winddirec…
## 18 10416 102853 15083 117936 0.128 0.0883 0.448 64 pct_cloudcover
## 19 10416 102996 14940 117936 0.127 0.0883 0.434 46 pct_cloudcove…
## 20 10416 103110 14826 117936 0.126 0.0883 0.423 55 pct_diffuse_r…
## 21 10416 103169 14767 117936 0.125 0.0883 0.418 55 pct_shortwave…
## 22 10416 103186 14750 117936 0.125 0.0883 0.416 101 pct_windspeed…
## 23 10416 103197 14739 117936 0.125 0.0883 0.415 12 pct_weatherco…
## 24 10416 103299 14637 117936 0.124 0.0883 0.405 101 pct_windspeed…
## 25 10416 103594 14342 117936 0.122 0.0883 0.377 50 pct_direct_ra…
## 26 10416 103778 14158 117936 0.120 0.0883 0.359 50 pct_direct_no…
## 27 10416 103800 14136 117936 0.120 0.0883 0.357 50 pct_cloudcove…
## 28 10416 103861 14075 117936 0.119 0.0883 0.351 97 pct_windgusts…
## 29 10416 105069 12867 117936 0.109 0.0883 0.235 60 pct_relativeh…
## 30 10416 105133 12803 117936 0.109 0.0883 0.229 41 pct_cloudcove…
## 31 10416 106183 11753 117936 0.0997 0.0883 0.128 5 pct_snowfall
## 32 10416 106299 11637 117936 0.0987 0.0883 0.117 101 rnd100
## 33 10416 106324 11612 117936 0.0985 0.0883 0.115 12 pct_rain
## 34 10416 106924 11012 117936 0.0934 0.0883 0.0572 13 pct_precipita…
## 35 10416 106943 10993 117936 0.0932 0.0883 0.0554 26 rnd025
## 36 10416 107317 10619 117936 0.0900 0.0883 0.0195 6 rnd005
## 37 10416 107520 10416 117936 0.0883 0.0883 0 24 pct_hour
tmpDFR_v2 %>%
mutate(fillColor=ifelse(str_detect(vrbl, pattern="pct_"), "lightblue", "red")) %>%
ggplot(aes(x=fct_reorder(stringr::str_replace_all(vrbl, "pct_", ""), lift), y=lift)) +
geom_col(aes(fill=fillColor)) +
coord_flip() +
labs(x=NULL, y="lift", title="Lift by hourly variable percentile in predicting month") +
scale_fill_identity()
# Remove hour from tmpNames
tmpNamesNoHour_v2 <- setdiff(tmpNames_v2, "pct_hour")
tmpNamesNoHour_v2
## [1] "pct_temperature_2m" "pct_relativehumidity_2m"
## [3] "pct_dewpoint_2m" "pct_apparent_temperature"
## [5] "pct_pressure_msl" "pct_surface_pressure"
## [7] "pct_precipitation" "pct_rain"
## [9] "pct_snowfall" "pct_cloudcover"
## [11] "pct_cloudcover_low" "pct_cloudcover_mid"
## [13] "pct_cloudcover_high" "pct_shortwave_radiation"
## [15] "pct_direct_radiation" "pct_direct_normal_irradiance"
## [17] "pct_diffuse_radiation" "pct_windspeed_10m"
## [19] "pct_windspeed_100m" "pct_winddirection_10m"
## [21] "pct_winddirection_100m" "pct_windgusts_10m"
## [23] "pct_et0_fao_evapotranspiration" "pct_weathercode"
## [25] "pct_vapor_pressure_deficit" "pct_soil_temperature_0_to_7cm"
## [27] "pct_soil_temperature_7_to_28cm" "pct_soil_temperature_28_to_100cm"
## [29] "pct_soil_temperature_100_to_255cm" "pct_soil_moisture_0_to_7cm"
## [31] "pct_soil_moisture_7_to_28cm" "pct_soil_moisture_28_to_100cm"
## [33] "pct_soil_moisture_100_to_255cm" "rnd005"
## [35] "rnd025" "rnd100"
# Get the key predictive metrics
tmpDFRHour_v2 <- map_dfr(.x=tmpNamesNoHour_v2,
.f=function(x) simpleOneVarPredict(tmpTemp, tgt="fct_hour", prd=x, nPrint=0, showPlot=FALSE)$dfConfAll
) %>%
mutate(vrbl=tmpNamesNoHour_v2) %>%
arrange(desc(lift))
# Print and plot lift by variable
tmpDFRHour_v2 %>%
print(n=50)
## # A tibble: 36 × 9
## nMax `FALSE` `TRUE` n pctCorrect pctNaive lift nBucket vrbl
## <int> <int> <int> <int> <dbl> <dbl> <dbl> <int> <chr>
## 1 4914 102257 15679 117936 0.133 0.0417 2.19 55 pct_diffuse_r…
## 2 4914 102824 15112 117936 0.128 0.0417 2.08 55 pct_shortwave…
## 3 4914 105224 12712 117936 0.108 0.0417 1.59 50 pct_direct_ra…
## 4 4914 105893 12043 117936 0.102 0.0417 1.45 50 pct_direct_no…
## 5 4914 106137 11799 117936 0.100 0.0417 1.40 40 pct_et0_fao_e…
## 6 4914 109462 8474 117936 0.0719 0.0417 0.724 60 pct_relativeh…
## 7 4914 109962 7974 117936 0.0676 0.0417 0.623 79 pct_vapor_pre…
## 8 4914 110201 7735 117936 0.0656 0.0417 0.574 101 pct_soil_temp…
## 9 4914 110597 7339 117936 0.0622 0.0417 0.493 97 pct_windgusts…
## 10 4914 110778 7158 117936 0.0607 0.0417 0.457 101 pct_windspeed…
## 11 4914 110854 7082 117936 0.0600 0.0417 0.441 101 pct_temperatu…
## 12 4914 110941 6995 117936 0.0593 0.0417 0.423 101 pct_winddirec…
## 13 4914 110958 6978 117936 0.0592 0.0417 0.420 101 pct_windspeed…
## 14 4914 110983 6953 117936 0.0590 0.0417 0.415 101 pct_apparent_…
## 15 4914 111081 6855 117936 0.0581 0.0417 0.395 101 pct_winddirec…
## 16 4914 111522 6414 117936 0.0544 0.0417 0.305 101 pct_pressure_…
## 17 4914 111548 6388 117936 0.0542 0.0417 0.300 101 pct_surface_p…
## 18 4914 111600 6336 117936 0.0537 0.0417 0.289 64 pct_cloudcover
## 19 4914 111643 6293 117936 0.0534 0.0417 0.281 101 pct_dewpoint_…
## 20 4914 111654 6282 117936 0.0533 0.0417 0.278 101 rnd100
## 21 4914 111667 6269 117936 0.0532 0.0417 0.276 101 pct_soil_temp…
## 22 4914 111831 6105 117936 0.0518 0.0417 0.242 46 pct_cloudcove…
## 23 4914 111846 6090 117936 0.0516 0.0417 0.239 50 pct_cloudcove…
## 24 4914 111908 6028 117936 0.0511 0.0417 0.227 101 pct_soil_mois…
## 25 4914 112038 5898 117936 0.0500 0.0417 0.200 100 pct_soil_mois…
## 26 4914 112197 5739 117936 0.0487 0.0417 0.168 12 pct_weatherco…
## 27 4914 112274 5662 117936 0.0480 0.0417 0.152 41 pct_cloudcove…
## 28 4914 112337 5599 117936 0.0475 0.0417 0.139 26 rnd025
## 29 4914 112537 5399 117936 0.0458 0.0417 0.0987 96 pct_soil_mois…
## 30 4914 112566 5370 117936 0.0455 0.0417 0.0928 101 pct_soil_temp…
## 31 4914 112687 5249 117936 0.0445 0.0417 0.0682 12 pct_rain
## 32 4914 112691 5245 117936 0.0445 0.0417 0.0674 13 pct_precipita…
## 33 4914 112702 5234 117936 0.0444 0.0417 0.0651 101 pct_soil_temp…
## 34 4914 112711 5225 117936 0.0443 0.0417 0.0633 6 rnd005
## 35 4914 112803 5133 117936 0.0435 0.0417 0.0446 66 pct_soil_mois…
## 36 4914 112966 4970 117936 0.0421 0.0417 0.0114 5 pct_snowfall
tmpDFRHour_v2 %>%
mutate(fillColor=ifelse(str_detect(vrbl, pattern="pct_"), "lightblue", "red")) %>%
ggplot(aes(x=fct_reorder(stringr::str_replace_all(vrbl, "pct_", ""), lift), y=lift)) +
geom_col(aes(fill=fillColor)) +
coord_flip() +
labs(x=NULL, y="lift", title="Lift by hourly variable percentile in predicting hour") +
scale_fill_identity()